mirror of
https://github.com/kemko/nomad.git
synced 2026-01-06 02:15:43 +03:00
Merge pull request #10796 from hashicorp/external-nvidia
devices: externalize nvidia device driver
This commit is contained in:
@@ -625,13 +625,9 @@ workflows:
|
||||
test_module: "api"
|
||||
filters: *backend_test_branches_filter
|
||||
enable_race_testing: true
|
||||
- test-container:
|
||||
name: "test-devices"
|
||||
test_packages: "./devices/..."
|
||||
filters: *backend_test_branches_filter
|
||||
- test-machine:
|
||||
name: "test-other"
|
||||
exclude_packages: "./api|./client|./drivers/docker|./drivers/exec|./drivers/shared/executor|./nomad|./devices|./e2e"
|
||||
exclude_packages: "./api|./client|./drivers/docker|./drivers/exec|./drivers/shared/executor|./nomad|./e2e"
|
||||
filters: *backend_test_branches_filter
|
||||
- test-machine:
|
||||
name: "test-docker"
|
||||
|
||||
@@ -1,21 +0,0 @@
|
||||
This package provides an implementation of the Nvidia device plugin.
|
||||
|
||||
# Behavior
|
||||
|
||||
Nvidia device plugin uses NVML bindings to get data regarding available nvidia devices and will expose them via Fingerprint RPC. GPUs can be excluded from fingerprinting by setting the `ignored_gpu_ids` field. Plugin sends statistics for fingerprinted devices every `stats_period` period.
|
||||
|
||||
# Config
|
||||
|
||||
The configuration should be passed via an HCL file that begins with a top level `config` stanza:
|
||||
|
||||
```
|
||||
config {
|
||||
ignored_gpu_ids = ["uuid1", "uuid2"]
|
||||
fingerprint_period = "5s"
|
||||
}
|
||||
```
|
||||
|
||||
The valid configuration options are:
|
||||
|
||||
* `ignored_gpu_ids` (`list(string)`: `[]`): list of GPU UUID strings that should not be exposed to Nomad
|
||||
* `fingerprint_period` (`string`: `"1m"`): interval to repeat the fingerprint process to identify possible changes.
|
||||
@@ -1,20 +0,0 @@
|
||||
package main
|
||||
|
||||
import (
|
||||
"context"
|
||||
|
||||
log "github.com/hashicorp/go-hclog"
|
||||
|
||||
"github.com/hashicorp/nomad/devices/gpu/nvidia"
|
||||
"github.com/hashicorp/nomad/plugins"
|
||||
)
|
||||
|
||||
func main() {
|
||||
// Serve the plugin
|
||||
plugins.ServeCtx(factory)
|
||||
}
|
||||
|
||||
// factory returns a new instance of the Nvidia GPU plugin
|
||||
func factory(ctx context.Context, log log.Logger) interface{} {
|
||||
return nvidia.NewNvidiaDevice(ctx, log)
|
||||
}
|
||||
@@ -1,228 +0,0 @@
|
||||
package nvidia
|
||||
|
||||
import (
|
||||
"context"
|
||||
"fmt"
|
||||
"strings"
|
||||
"sync"
|
||||
"time"
|
||||
|
||||
log "github.com/hashicorp/go-hclog"
|
||||
"github.com/hashicorp/nomad/devices/gpu/nvidia/nvml"
|
||||
"github.com/hashicorp/nomad/helper/pluginutils/loader"
|
||||
"github.com/hashicorp/nomad/plugins/base"
|
||||
"github.com/hashicorp/nomad/plugins/device"
|
||||
"github.com/hashicorp/nomad/plugins/shared/hclspec"
|
||||
)
|
||||
|
||||
const (
|
||||
// pluginName is the name of the plugin
|
||||
pluginName = "nvidia-gpu"
|
||||
|
||||
// vendor is the vendor providing the devices
|
||||
vendor = "nvidia"
|
||||
|
||||
// deviceType is the type of device being returned
|
||||
deviceType = device.DeviceTypeGPU
|
||||
|
||||
// notAvailable value is returned to nomad server in case some properties were
|
||||
// undetected by nvml driver
|
||||
notAvailable = "N/A"
|
||||
|
||||
// Nvidia-container-runtime environment variable names
|
||||
NvidiaVisibleDevices = "NVIDIA_VISIBLE_DEVICES"
|
||||
)
|
||||
|
||||
var (
|
||||
// PluginID is the nvidia plugin metadata registered in the plugin
|
||||
// catalog.
|
||||
PluginID = loader.PluginID{
|
||||
Name: pluginName,
|
||||
PluginType: base.PluginTypeDevice,
|
||||
}
|
||||
|
||||
// PluginConfig is the nvidia factory function registered in the
|
||||
// plugin catalog.
|
||||
PluginConfig = &loader.InternalPluginConfig{
|
||||
Factory: func(ctx context.Context, l log.Logger) interface{} { return NewNvidiaDevice(ctx, l) },
|
||||
}
|
||||
|
||||
// pluginInfo describes the plugin
|
||||
pluginInfo = &base.PluginInfoResponse{
|
||||
Type: base.PluginTypeDevice,
|
||||
PluginApiVersions: []string{device.ApiVersion010},
|
||||
PluginVersion: "0.1.0",
|
||||
Name: pluginName,
|
||||
}
|
||||
|
||||
// configSpec is the specification of the plugin's configuration
|
||||
configSpec = hclspec.NewObject(map[string]*hclspec.Spec{
|
||||
"enabled": hclspec.NewDefault(
|
||||
hclspec.NewAttr("enabled", "bool", false),
|
||||
hclspec.NewLiteral("true"),
|
||||
),
|
||||
"ignored_gpu_ids": hclspec.NewDefault(
|
||||
hclspec.NewAttr("ignored_gpu_ids", "list(string)", false),
|
||||
hclspec.NewLiteral("[]"),
|
||||
),
|
||||
"fingerprint_period": hclspec.NewDefault(
|
||||
hclspec.NewAttr("fingerprint_period", "string", false),
|
||||
hclspec.NewLiteral("\"1m\""),
|
||||
),
|
||||
})
|
||||
)
|
||||
|
||||
// Config is the decoded form of the plugin configuration that SetConfig
// receives.
type Config struct {
	// Enabled is copied to NvidiaDevice.enabled by SetConfig.
	Enabled bool `codec:"enabled"`

	// IgnoredGPUIDs lists the GPU UUIDs that must not be fingerprinted.
	IgnoredGPUIDs []string `codec:"ignored_gpu_ids"`

	// FingerprintPeriod is a duration string parsed with time.ParseDuration.
	FingerprintPeriod string `codec:"fingerprint_period"`
}
|
||||
|
||||
// NvidiaDevice contains all plugin specific data
|
||||
type NvidiaDevice struct {
|
||||
// enabled indicates whether the plugin should be enabled
|
||||
enabled bool
|
||||
|
||||
// nvmlClient is used to get data from nvidia
|
||||
nvmlClient nvml.NvmlClient
|
||||
|
||||
// initErr holds an error retrieved during
|
||||
// nvmlClient initialization
|
||||
initErr error
|
||||
|
||||
// ignoredGPUIDs is a set of UUIDs that would not be exposed to nomad
|
||||
ignoredGPUIDs map[string]struct{}
|
||||
|
||||
// fingerprintPeriod is how often we should call nvml to get list of devices
|
||||
fingerprintPeriod time.Duration
|
||||
|
||||
// devices is the set of detected eligible devices
|
||||
devices map[string]struct{}
|
||||
deviceLock sync.RWMutex
|
||||
|
||||
logger log.Logger
|
||||
}
|
||||
|
||||
// NewNvidiaDevice returns a new nvidia device plugin.
|
||||
func NewNvidiaDevice(_ context.Context, log log.Logger) *NvidiaDevice {
|
||||
nvmlClient, err := nvml.NewNvmlClient()
|
||||
logger := log.Named(pluginName)
|
||||
if err != nil && err.Error() != nvml.UnavailableLib.Error() {
|
||||
logger.Error("unable to initialize Nvidia driver", "reason", err)
|
||||
}
|
||||
return &NvidiaDevice{
|
||||
logger: logger,
|
||||
devices: make(map[string]struct{}),
|
||||
ignoredGPUIDs: make(map[string]struct{}),
|
||||
nvmlClient: nvmlClient,
|
||||
initErr: err,
|
||||
}
|
||||
}
|
||||
|
||||
// PluginInfo returns information describing the plugin.
|
||||
func (d *NvidiaDevice) PluginInfo() (*base.PluginInfoResponse, error) {
|
||||
return pluginInfo, nil
|
||||
}
|
||||
|
||||
// ConfigSchema returns the plugins configuration schema.
|
||||
func (d *NvidiaDevice) ConfigSchema() (*hclspec.Spec, error) {
|
||||
return configSpec, nil
|
||||
}
|
||||
|
||||
// SetConfig is used to set the configuration of the plugin.
|
||||
func (d *NvidiaDevice) SetConfig(cfg *base.Config) error {
|
||||
var config Config
|
||||
if len(cfg.PluginConfig) != 0 {
|
||||
if err := base.MsgPackDecode(cfg.PluginConfig, &config); err != nil {
|
||||
return err
|
||||
}
|
||||
}
|
||||
|
||||
d.enabled = config.Enabled
|
||||
|
||||
for _, ignoredGPUId := range config.IgnoredGPUIDs {
|
||||
d.ignoredGPUIDs[ignoredGPUId] = struct{}{}
|
||||
}
|
||||
|
||||
period, err := time.ParseDuration(config.FingerprintPeriod)
|
||||
if err != nil {
|
||||
return fmt.Errorf("failed to parse fingerprint period %q: %v", config.FingerprintPeriod, err)
|
||||
}
|
||||
d.fingerprintPeriod = period
|
||||
|
||||
return nil
|
||||
}
|
||||
|
||||
// Fingerprint streams detected devices. If device changes are detected or the
|
||||
// devices health changes, messages will be emitted.
|
||||
func (d *NvidiaDevice) Fingerprint(ctx context.Context) (<-chan *device.FingerprintResponse, error) {
|
||||
if !d.enabled {
|
||||
return nil, device.ErrPluginDisabled
|
||||
}
|
||||
|
||||
outCh := make(chan *device.FingerprintResponse)
|
||||
go d.fingerprint(ctx, outCh)
|
||||
return outCh, nil
|
||||
}
|
||||
|
||||
// reservationError reports device IDs passed to Reserve that are not part of
// the currently fingerprinted device set.
type reservationError struct {
	notExistingIDs []string
}

// Error lists the unknown device IDs, separated by commas.
func (e *reservationError) Error() string {
	joined := strings.Join(e.notExistingIDs, ",")
	return fmt.Sprintf("unknown device IDs: %s", joined)
}
|
||||
|
||||
// Reserve returns information on how to mount given devices.
|
||||
// Assumption is made that nomad server is responsible for correctness of
|
||||
// GPU allocations, handling tricky cases such as double-allocation of single GPU
|
||||
func (d *NvidiaDevice) Reserve(deviceIDs []string) (*device.ContainerReservation, error) {
|
||||
if len(deviceIDs) == 0 {
|
||||
return &device.ContainerReservation{}, nil
|
||||
}
|
||||
if !d.enabled {
|
||||
return nil, device.ErrPluginDisabled
|
||||
}
|
||||
|
||||
// Due to the asynchronous nature of NvidiaPlugin, there is a possibility
|
||||
// of race condition
|
||||
//
|
||||
// Timeline:
|
||||
// 1 - fingerprint reports that GPU with id "1" is present
|
||||
// 2 - the following events happen at the same time:
|
||||
// a) server decides to allocate GPU with id "1"
|
||||
// b) fingerprint check reports that GPU with id "1" is no more present
|
||||
//
|
||||
// The latest and always valid version of fingerprinted ids are stored in
|
||||
// d.devices map. To avoid this race condition an error is returned if
|
||||
// any of provided deviceIDs is not found in d.devices map
|
||||
d.deviceLock.RLock()
|
||||
var notExistingIDs []string
|
||||
for _, id := range deviceIDs {
|
||||
if _, deviceIDExists := d.devices[id]; !deviceIDExists {
|
||||
notExistingIDs = append(notExistingIDs, id)
|
||||
}
|
||||
}
|
||||
d.deviceLock.RUnlock()
|
||||
if len(notExistingIDs) != 0 {
|
||||
return nil, &reservationError{notExistingIDs}
|
||||
}
|
||||
|
||||
return &device.ContainerReservation{
|
||||
Envs: map[string]string{
|
||||
NvidiaVisibleDevices: strings.Join(deviceIDs, ","),
|
||||
},
|
||||
}, nil
|
||||
}
|
||||
|
||||
// Stats streams statistics for the detected devices.
|
||||
func (d *NvidiaDevice) Stats(ctx context.Context, interval time.Duration) (<-chan *device.StatsResponse, error) {
|
||||
if !d.enabled {
|
||||
return nil, device.ErrPluginDisabled
|
||||
}
|
||||
|
||||
outCh := make(chan *device.StatsResponse)
|
||||
go d.stats(ctx, outCh, interval)
|
||||
return outCh, nil
|
||||
}
|
||||
@@ -1,140 +0,0 @@
|
||||
package nvidia
|
||||
|
||||
import (
|
||||
"testing"
|
||||
|
||||
hclog "github.com/hashicorp/go-hclog"
|
||||
"github.com/hashicorp/nomad/devices/gpu/nvidia/nvml"
|
||||
"github.com/hashicorp/nomad/plugins/device"
|
||||
"github.com/stretchr/testify/require"
|
||||
)
|
||||
|
||||
type MockNvmlClient struct {
|
||||
FingerprintError error
|
||||
FingerprintResponseReturned *nvml.FingerprintData
|
||||
|
||||
StatsError error
|
||||
StatsResponseReturned []*nvml.StatsData
|
||||
}
|
||||
|
||||
func (c *MockNvmlClient) GetFingerprintData() (*nvml.FingerprintData, error) {
|
||||
return c.FingerprintResponseReturned, c.FingerprintError
|
||||
}
|
||||
|
||||
func (c *MockNvmlClient) GetStatsData() ([]*nvml.StatsData, error) {
|
||||
return c.StatsResponseReturned, c.StatsError
|
||||
}
|
||||
|
||||
func TestReserve(t *testing.T) {
|
||||
cases := []struct {
|
||||
Name string
|
||||
ExpectedReservation *device.ContainerReservation
|
||||
ExpectedError error
|
||||
Device *NvidiaDevice
|
||||
RequestedIDs []string
|
||||
}{
|
||||
{
|
||||
Name: "All RequestedIDs are not managed by Device",
|
||||
ExpectedReservation: nil,
|
||||
ExpectedError: &reservationError{[]string{
|
||||
"UUID1",
|
||||
"UUID2",
|
||||
"UUID3",
|
||||
}},
|
||||
RequestedIDs: []string{
|
||||
"UUID1",
|
||||
"UUID2",
|
||||
"UUID3",
|
||||
},
|
||||
Device: &NvidiaDevice{
|
||||
logger: hclog.NewNullLogger(),
|
||||
enabled: true,
|
||||
},
|
||||
},
|
||||
{
|
||||
Name: "Some RequestedIDs are not managed by Device",
|
||||
ExpectedReservation: nil,
|
||||
ExpectedError: &reservationError{[]string{
|
||||
"UUID1",
|
||||
"UUID2",
|
||||
}},
|
||||
RequestedIDs: []string{
|
||||
"UUID1",
|
||||
"UUID2",
|
||||
"UUID3",
|
||||
},
|
||||
Device: &NvidiaDevice{
|
||||
devices: map[string]struct{}{
|
||||
"UUID3": {},
|
||||
},
|
||||
logger: hclog.NewNullLogger(),
|
||||
enabled: true,
|
||||
},
|
||||
},
|
||||
{
|
||||
Name: "All RequestedIDs are managed by Device",
|
||||
ExpectedReservation: &device.ContainerReservation{
|
||||
Envs: map[string]string{
|
||||
NvidiaVisibleDevices: "UUID1,UUID2,UUID3",
|
||||
},
|
||||
},
|
||||
ExpectedError: nil,
|
||||
RequestedIDs: []string{
|
||||
"UUID1",
|
||||
"UUID2",
|
||||
"UUID3",
|
||||
},
|
||||
Device: &NvidiaDevice{
|
||||
devices: map[string]struct{}{
|
||||
"UUID1": {},
|
||||
"UUID2": {},
|
||||
"UUID3": {},
|
||||
},
|
||||
logger: hclog.NewNullLogger(),
|
||||
enabled: true,
|
||||
},
|
||||
},
|
||||
{
|
||||
Name: "No IDs requested",
|
||||
ExpectedReservation: &device.ContainerReservation{},
|
||||
ExpectedError: nil,
|
||||
RequestedIDs: nil,
|
||||
Device: &NvidiaDevice{
|
||||
devices: map[string]struct{}{
|
||||
"UUID1": {},
|
||||
"UUID2": {},
|
||||
"UUID3": {},
|
||||
},
|
||||
logger: hclog.NewNullLogger(),
|
||||
enabled: true,
|
||||
},
|
||||
},
|
||||
{
|
||||
Name: "Device is disabled",
|
||||
ExpectedReservation: nil,
|
||||
ExpectedError: device.ErrPluginDisabled,
|
||||
RequestedIDs: []string{
|
||||
"UUID1",
|
||||
"UUID2",
|
||||
"UUID3",
|
||||
},
|
||||
Device: &NvidiaDevice{
|
||||
devices: map[string]struct{}{
|
||||
"UUID1": {},
|
||||
"UUID2": {},
|
||||
"UUID3": {},
|
||||
},
|
||||
logger: hclog.NewNullLogger(),
|
||||
enabled: false,
|
||||
},
|
||||
},
|
||||
}
|
||||
|
||||
for _, c := range cases {
|
||||
t.Run(c.Name, func(t *testing.T) {
|
||||
actualReservation, actualError := c.Device.Reserve(c.RequestedIDs)
|
||||
require.Equal(t, c.ExpectedReservation, actualReservation)
|
||||
require.Equal(t, c.ExpectedError, actualError)
|
||||
})
|
||||
}
|
||||
}
|
||||
@@ -1,229 +0,0 @@
|
||||
package nvidia
|
||||
|
||||
import (
|
||||
"context"
|
||||
"time"
|
||||
|
||||
"github.com/hashicorp/nomad/devices/gpu/nvidia/nvml"
|
||||
"github.com/hashicorp/nomad/helper"
|
||||
"github.com/hashicorp/nomad/plugins/device"
|
||||
"github.com/hashicorp/nomad/plugins/shared/structs"
|
||||
)
|
||||
|
||||
// Attribute names used as keys of the attribute map reported for each
// fingerprinted device group.
const (
	// Attributes that are only reported when the value is available.
	MemoryAttr       = "memory"
	PowerAttr        = "power"
	BAR1Attr         = "bar1"
	CoresClockAttr   = "cores_clock"
	MemoryClockAttr  = "memory_clock"
	PCIBandwidthAttr = "pci_bandwidth"

	// Attributes that are always reported.
	DriverVersionAttr   = "driver_version"
	DisplayStateAttr    = "display_state"
	PersistenceModeAttr = "persistence_mode"
)
|
||||
|
||||
// fingerprint is the long running goroutine that detects hardware
|
||||
func (d *NvidiaDevice) fingerprint(ctx context.Context, devices chan<- *device.FingerprintResponse) {
|
||||
defer close(devices)
|
||||
|
||||
if d.initErr != nil {
|
||||
if d.initErr.Error() != nvml.UnavailableLib.Error() {
|
||||
d.logger.Error("exiting fingerprinting due to problems with NVML loading", "error", d.initErr)
|
||||
devices <- device.NewFingerprintError(d.initErr)
|
||||
}
|
||||
|
||||
// Just close the channel to let server know that there are no working
|
||||
// Nvidia GPU units
|
||||
return
|
||||
}
|
||||
|
||||
// Create a timer that will fire immediately for the first detection
|
||||
ticker := time.NewTimer(0)
|
||||
|
||||
for {
|
||||
select {
|
||||
case <-ctx.Done():
|
||||
return
|
||||
case <-ticker.C:
|
||||
ticker.Reset(d.fingerprintPeriod)
|
||||
}
|
||||
d.writeFingerprintToChannel(devices)
|
||||
}
|
||||
}
|
||||
|
||||
// writeFingerprintToChannel makes nvml call and writes response to channel
|
||||
func (d *NvidiaDevice) writeFingerprintToChannel(devices chan<- *device.FingerprintResponse) {
|
||||
fingerprintData, err := d.nvmlClient.GetFingerprintData()
|
||||
if err != nil {
|
||||
d.logger.Error("failed to get fingerprint nvidia devices", "error", err)
|
||||
devices <- device.NewFingerprintError(err)
|
||||
return
|
||||
}
|
||||
|
||||
// ignore devices from fingerprint output
|
||||
fingerprintDevices := ignoreFingerprintedDevices(fingerprintData.Devices, d.ignoredGPUIDs)
|
||||
// check if any device health was updated or any device was added to host
|
||||
if !d.fingerprintChanged(fingerprintDevices) {
|
||||
return
|
||||
}
|
||||
|
||||
commonAttributes := map[string]*structs.Attribute{
|
||||
DriverVersionAttr: {
|
||||
String: helper.StringToPtr(fingerprintData.DriverVersion),
|
||||
},
|
||||
}
|
||||
|
||||
// Group all FingerprintDevices by DeviceName attribute
|
||||
deviceListByDeviceName := make(map[string][]*nvml.FingerprintDeviceData)
|
||||
for _, device := range fingerprintDevices {
|
||||
deviceName := device.DeviceName
|
||||
if deviceName == nil {
|
||||
// nvml driver was not able to detect device name. This kind
|
||||
// of devices are placed to single group with 'notAvailable' name
|
||||
notAvailableCopy := notAvailable
|
||||
deviceName = ¬AvailableCopy
|
||||
}
|
||||
|
||||
deviceListByDeviceName[*deviceName] = append(deviceListByDeviceName[*deviceName], device)
|
||||
}
|
||||
|
||||
// Build Fingerprint response with computed groups and send it over the channel
|
||||
deviceGroups := make([]*device.DeviceGroup, 0, len(deviceListByDeviceName))
|
||||
for groupName, devices := range deviceListByDeviceName {
|
||||
deviceGroups = append(deviceGroups, deviceGroupFromFingerprintData(groupName, devices, commonAttributes))
|
||||
}
|
||||
devices <- device.NewFingerprint(deviceGroups...)
|
||||
}
|
||||
|
||||
// ignoreFingerprintedDevices excludes ignored devices from fingerprint output
|
||||
func ignoreFingerprintedDevices(deviceData []*nvml.FingerprintDeviceData, ignoredGPUIDs map[string]struct{}) []*nvml.FingerprintDeviceData {
|
||||
var result []*nvml.FingerprintDeviceData
|
||||
for _, fingerprintDevice := range deviceData {
|
||||
if _, ignored := ignoredGPUIDs[fingerprintDevice.UUID]; !ignored {
|
||||
result = append(result, fingerprintDevice)
|
||||
}
|
||||
}
|
||||
return result
|
||||
}
|
||||
|
||||
// fingerprintChanged checks if there are any previously unseen nvidia devices located
|
||||
// or any of fingerprinted nvidia devices disappeared since the last fingerprint run.
|
||||
// Also, this func updates device map on NvidiaDevice with the latest data
|
||||
func (d *NvidiaDevice) fingerprintChanged(allDevices []*nvml.FingerprintDeviceData) bool {
|
||||
d.deviceLock.Lock()
|
||||
defer d.deviceLock.Unlock()
|
||||
|
||||
changeDetected := false
|
||||
// check if every device in allDevices is in d.devices
|
||||
for _, device := range allDevices {
|
||||
if _, ok := d.devices[device.UUID]; !ok {
|
||||
changeDetected = true
|
||||
}
|
||||
}
|
||||
|
||||
// check if every device in d.devices is in allDevices
|
||||
fingerprintDeviceMap := make(map[string]struct{})
|
||||
for _, device := range allDevices {
|
||||
fingerprintDeviceMap[device.UUID] = struct{}{}
|
||||
}
|
||||
for id := range d.devices {
|
||||
if _, ok := fingerprintDeviceMap[id]; !ok {
|
||||
changeDetected = true
|
||||
}
|
||||
}
|
||||
|
||||
d.devices = fingerprintDeviceMap
|
||||
return changeDetected
|
||||
}
|
||||
|
||||
// deviceGroupFromFingerprintData composes deviceGroup from FingerprintDeviceData slice
|
||||
func deviceGroupFromFingerprintData(groupName string, deviceList []*nvml.FingerprintDeviceData, commonAttributes map[string]*structs.Attribute) *device.DeviceGroup {
|
||||
// deviceGroup without devices makes no sense -> return nil when no devices are provided
|
||||
if len(deviceList) == 0 {
|
||||
return nil
|
||||
}
|
||||
|
||||
devices := make([]*device.Device, len(deviceList))
|
||||
for index, dev := range deviceList {
|
||||
devices[index] = &device.Device{
|
||||
ID: dev.UUID,
|
||||
// all fingerprinted devices are "healthy" for now
|
||||
// to get real health data -> dcgm bindings should be used
|
||||
Healthy: true,
|
||||
HwLocality: &device.DeviceLocality{
|
||||
PciBusID: dev.PCIBusID,
|
||||
},
|
||||
}
|
||||
}
|
||||
|
||||
deviceGroup := &device.DeviceGroup{
|
||||
Vendor: vendor,
|
||||
Type: deviceType,
|
||||
Name: groupName,
|
||||
Devices: devices,
|
||||
// Assumption made that devices with the same DeviceName have the same
|
||||
// attributes like amount of memory, power, bar1memory etc
|
||||
Attributes: attributesFromFingerprintDeviceData(deviceList[0]),
|
||||
}
|
||||
|
||||
// Extend attribute map with common attributes
|
||||
for attributeKey, attributeValue := range commonAttributes {
|
||||
deviceGroup.Attributes[attributeKey] = attributeValue
|
||||
}
|
||||
|
||||
return deviceGroup
|
||||
}
|
||||
|
||||
// attributesFromFingerprintDeviceData converts nvml.FingerprintDeviceData
|
||||
// struct to device.DeviceGroup.Attributes format (map[string]string)
|
||||
// this function performs all nil checks for FingerprintDeviceData pointers
|
||||
func attributesFromFingerprintDeviceData(d *nvml.FingerprintDeviceData) map[string]*structs.Attribute {
|
||||
attrs := map[string]*structs.Attribute{
|
||||
DisplayStateAttr: {
|
||||
String: helper.StringToPtr(d.DisplayState),
|
||||
},
|
||||
PersistenceModeAttr: {
|
||||
String: helper.StringToPtr(d.PersistenceMode),
|
||||
},
|
||||
}
|
||||
|
||||
if d.MemoryMiB != nil {
|
||||
attrs[MemoryAttr] = &structs.Attribute{
|
||||
Int: helper.Int64ToPtr(int64(*d.MemoryMiB)),
|
||||
Unit: structs.UnitMiB,
|
||||
}
|
||||
}
|
||||
if d.PowerW != nil {
|
||||
attrs[PowerAttr] = &structs.Attribute{
|
||||
Int: helper.Int64ToPtr(int64(*d.PowerW)),
|
||||
Unit: structs.UnitW,
|
||||
}
|
||||
}
|
||||
if d.BAR1MiB != nil {
|
||||
attrs[BAR1Attr] = &structs.Attribute{
|
||||
Int: helper.Int64ToPtr(int64(*d.BAR1MiB)),
|
||||
Unit: structs.UnitMiB,
|
||||
}
|
||||
}
|
||||
if d.CoresClockMHz != nil {
|
||||
attrs[CoresClockAttr] = &structs.Attribute{
|
||||
Int: helper.Int64ToPtr(int64(*d.CoresClockMHz)),
|
||||
Unit: structs.UnitMHz,
|
||||
}
|
||||
}
|
||||
if d.MemoryClockMHz != nil {
|
||||
attrs[MemoryClockAttr] = &structs.Attribute{
|
||||
Int: helper.Int64ToPtr(int64(*d.MemoryClockMHz)),
|
||||
Unit: structs.UnitMHz,
|
||||
}
|
||||
}
|
||||
if d.PCIBandwidthMBPerS != nil {
|
||||
attrs[PCIBandwidthAttr] = &structs.Attribute{
|
||||
Int: helper.Int64ToPtr(int64(*d.PCIBandwidthMBPerS)),
|
||||
Unit: structs.UnitMBPerS,
|
||||
}
|
||||
}
|
||||
|
||||
return attrs
|
||||
}
|
||||
File diff suppressed because it is too large
Load Diff
@@ -1,194 +0,0 @@
|
||||
package nvml
|
||||
|
||||
import (
|
||||
"fmt"
|
||||
)
|
||||
|
||||
// DeviceData holds the fields shared by the fingerprint and stats views of an
// Nvidia device. Pointer fields may be nil; consumers check them before use.
type DeviceData struct {
	// UUID identifies the device.
	UUID string
	// DeviceName is the device name; nil when it could not be detected.
	DeviceName *string
	// MemoryMiB is a memory size in MiB.
	MemoryMiB *uint64
	// PowerW is a power figure in watts.
	PowerW *uint
	// BAR1MiB is a BAR1 memory size in MiB.
	BAR1MiB *uint64
}
|
||||
|
||||
// FingerprintDeviceData is a superset of DeviceData
|
||||
// it describes device specific fields returned from
|
||||
// nvml queries during fingerprinting call
|
||||
type FingerprintDeviceData struct {
|
||||
*DeviceData
|
||||
PCIBandwidthMBPerS *uint
|
||||
CoresClockMHz *uint
|
||||
MemoryClockMHz *uint
|
||||
DisplayState string
|
||||
PersistenceMode string
|
||||
PCIBusID string
|
||||
}
|
||||
|
||||
// FingerprintData represets attributes of driver/devices
|
||||
type FingerprintData struct {
|
||||
Devices []*FingerprintDeviceData
|
||||
DriverVersion string
|
||||
}
|
||||
|
||||
// StatsData is a superset of DeviceData
|
||||
// it represents statistics data returned for every Nvidia device
|
||||
type StatsData struct {
|
||||
*DeviceData
|
||||
PowerUsageW *uint
|
||||
GPUUtilization *uint
|
||||
MemoryUtilization *uint
|
||||
EncoderUtilization *uint
|
||||
DecoderUtilization *uint
|
||||
TemperatureC *uint
|
||||
UsedMemoryMiB *uint64
|
||||
BAR1UsedMiB *uint64
|
||||
ECCErrorsL1Cache *uint64
|
||||
ECCErrorsL2Cache *uint64
|
||||
ECCErrorsDevice *uint64
|
||||
}
|
||||
|
||||
// NvmlClient describes how users would use nvml library
|
||||
type NvmlClient interface {
|
||||
GetFingerprintData() (*FingerprintData, error)
|
||||
GetStatsData() ([]*StatsData, error)
|
||||
}
|
||||
|
||||
// nvmlClient implements NvmlClient
|
||||
// Users of this lib are expected to use this struct via NewNvmlClient func
|
||||
type nvmlClient struct {
|
||||
driver NvmlDriver
|
||||
}
|
||||
|
||||
// NewNvmlClient function creates new nvmlClient with real
|
||||
// NvmlDriver implementation. Also, this func initializes NvmlDriver
|
||||
func NewNvmlClient() (*nvmlClient, error) {
|
||||
driver := &nvmlDriver{}
|
||||
err := driver.Initialize()
|
||||
if err != nil {
|
||||
return nil, err
|
||||
}
|
||||
return &nvmlClient{
|
||||
driver: driver,
|
||||
}, nil
|
||||
}
|
||||
|
||||
// GetFingerprintData returns FingerprintData for available Nvidia devices
|
||||
func (c *nvmlClient) GetFingerprintData() (*FingerprintData, error) {
|
||||
/*
|
||||
nvml fields to be fingerprinted # nvml_library_call
|
||||
1 - Driver Version # nvmlSystemGetDriverVersion
|
||||
2 - Product Name # nvmlDeviceGetName
|
||||
3 - GPU UUID # nvmlDeviceGetUUID
|
||||
4 - Total Memory # nvmlDeviceGetMemoryInfo
|
||||
5 - Power # nvmlDeviceGetPowerManagementLimit
|
||||
6 - PCIBusID # nvmlDeviceGetPciInfo
|
||||
7 - BAR1 Memory # nvmlDeviceGetBAR1MemoryInfo(
|
||||
8 - PCI Bandwidth
|
||||
9 - Memory, Cores Clock # nvmlDeviceGetMaxClockInfo
|
||||
10 - Display Mode # nvmlDeviceGetDisplayMode
|
||||
11 - Persistence Mode # nvmlDeviceGetPersistenceMode
|
||||
*/
|
||||
|
||||
// Assumed that this method is called with receiver retrieved from
|
||||
// NewNvmlClient
|
||||
// because this method handles initialization of NVML library
|
||||
|
||||
driverVersion, err := c.driver.SystemDriverVersion()
|
||||
if err != nil {
|
||||
return nil, fmt.Errorf("nvidia nvml SystemDriverVersion() error: %v\n", err)
|
||||
}
|
||||
|
||||
numDevices, err := c.driver.DeviceCount()
|
||||
if err != nil {
|
||||
return nil, fmt.Errorf("nvidia nvml DeviceCount() error: %v\n", err)
|
||||
}
|
||||
|
||||
allNvidiaGPUResources := make([]*FingerprintDeviceData, numDevices)
|
||||
|
||||
for i := 0; i < int(numDevices); i++ {
|
||||
deviceInfo, err := c.driver.DeviceInfoByIndex(uint(i))
|
||||
if err != nil {
|
||||
return nil, fmt.Errorf("nvidia nvml DeviceInfoByIndex() error: %v\n", err)
|
||||
}
|
||||
|
||||
allNvidiaGPUResources[i] = &FingerprintDeviceData{
|
||||
DeviceData: &DeviceData{
|
||||
DeviceName: deviceInfo.Name,
|
||||
UUID: deviceInfo.UUID,
|
||||
MemoryMiB: deviceInfo.MemoryMiB,
|
||||
PowerW: deviceInfo.PowerW,
|
||||
BAR1MiB: deviceInfo.BAR1MiB,
|
||||
},
|
||||
PCIBandwidthMBPerS: deviceInfo.PCIBandwidthMBPerS,
|
||||
CoresClockMHz: deviceInfo.CoresClockMHz,
|
||||
MemoryClockMHz: deviceInfo.MemoryClockMHz,
|
||||
DisplayState: deviceInfo.DisplayState,
|
||||
PersistenceMode: deviceInfo.PersistenceMode,
|
||||
PCIBusID: deviceInfo.PCIBusID,
|
||||
}
|
||||
}
|
||||
return &FingerprintData{
|
||||
Devices: allNvidiaGPUResources,
|
||||
DriverVersion: driverVersion,
|
||||
}, nil
|
||||
}
|
||||
|
||||
// GetStatsData returns statistics data for all devices on this machine
|
||||
func (c *nvmlClient) GetStatsData() ([]*StatsData, error) {
|
||||
/*
|
||||
nvml fields to be reported to stats api # nvml_library_call
|
||||
1 - Used Memory # nvmlDeviceGetMemoryInfo
|
||||
2 - Utilization of GPU # nvmlDeviceGetUtilizationRates
|
||||
3 - Utilization of Memory # nvmlDeviceGetUtilizationRates
|
||||
4 - Utilization of Decoder # nvmlDeviceGetDecoderUtilization
|
||||
5 - Utilization of Encoder # nvmlDeviceGetEncoderUtilization
|
||||
6 - Current GPU Temperature # nvmlDeviceGetTemperature
|
||||
7 - Power Draw # nvmlDeviceGetPowerUsage
|
||||
8 - BAR1 Used memory # nvmlDeviceGetBAR1MemoryInfo
|
||||
9 - ECC Errors on requesting L1Cache # nvmlDeviceGetMemoryErrorCounter
|
||||
10 - ECC Errors on requesting L2Cache # nvmlDeviceGetMemoryErrorCounter
|
||||
11 - ECC Errors on requesting Device memory # nvmlDeviceGetMemoryErrorCounter
|
||||
*/
|
||||
|
||||
// Assumed that this method is called with receiver retrieved from
|
||||
// NewNvmlClient
|
||||
// because this method handles initialization of NVML library
|
||||
|
||||
numDevices, err := c.driver.DeviceCount()
|
||||
if err != nil {
|
||||
return nil, fmt.Errorf("nvidia nvml DeviceCount() error: %v\n", err)
|
||||
}
|
||||
|
||||
allNvidiaGPUStats := make([]*StatsData, numDevices)
|
||||
|
||||
for i := 0; i < int(numDevices); i++ {
|
||||
deviceInfo, deviceStatus, err := c.driver.DeviceInfoAndStatusByIndex(uint(i))
|
||||
if err != nil {
|
||||
return nil, fmt.Errorf("nvidia nvml DeviceInfoAndStatusByIndex() error: %v\n", err)
|
||||
}
|
||||
|
||||
allNvidiaGPUStats[i] = &StatsData{
|
||||
DeviceData: &DeviceData{
|
||||
DeviceName: deviceInfo.Name,
|
||||
UUID: deviceInfo.UUID,
|
||||
MemoryMiB: deviceInfo.MemoryMiB,
|
||||
PowerW: deviceInfo.PowerW,
|
||||
BAR1MiB: deviceInfo.BAR1MiB,
|
||||
},
|
||||
PowerUsageW: deviceStatus.PowerUsageW,
|
||||
GPUUtilization: deviceStatus.GPUUtilization,
|
||||
MemoryUtilization: deviceStatus.MemoryUtilization,
|
||||
EncoderUtilization: deviceStatus.EncoderUtilization,
|
||||
DecoderUtilization: deviceStatus.DecoderUtilization,
|
||||
TemperatureC: deviceStatus.TemperatureC,
|
||||
UsedMemoryMiB: deviceStatus.UsedMemoryMiB,
|
||||
BAR1UsedMiB: deviceStatus.BAR1UsedMiB,
|
||||
ECCErrorsL1Cache: deviceStatus.ECCErrorsL1Cache,
|
||||
ECCErrorsL2Cache: deviceStatus.ECCErrorsL2Cache,
|
||||
ECCErrorsDevice: deviceStatus.ECCErrorsDevice,
|
||||
}
|
||||
}
|
||||
return allNvidiaGPUStats, nil
|
||||
}
|
||||
@@ -1,399 +0,0 @@
|
||||
package nvml
|
||||
|
||||
import (
|
||||
"errors"
|
||||
"testing"
|
||||
|
||||
"github.com/hashicorp/nomad/helper"
|
||||
"github.com/stretchr/testify/require"
|
||||
)
|
||||
|
||||
type MockNVMLDriver struct {
|
||||
systemDriverCallSuccessful bool
|
||||
deviceCountCallSuccessful bool
|
||||
deviceInfoByIndexCallSuccessful bool
|
||||
deviceInfoAndStatusByIndexCallSuccessful bool
|
||||
driverVersion string
|
||||
devices []*DeviceInfo
|
||||
deviceStatus []*DeviceStatus
|
||||
}
|
||||
|
||||
func (m *MockNVMLDriver) Initialize() error {
|
||||
return nil
|
||||
}
|
||||
|
||||
func (m *MockNVMLDriver) Shutdown() error {
|
||||
return nil
|
||||
}
|
||||
|
||||
func (m *MockNVMLDriver) SystemDriverVersion() (string, error) {
|
||||
if !m.systemDriverCallSuccessful {
|
||||
return "", errors.New("failed to get system driver")
|
||||
}
|
||||
return m.driverVersion, nil
|
||||
}
|
||||
|
||||
func (m *MockNVMLDriver) DeviceCount() (uint, error) {
|
||||
if !m.deviceCountCallSuccessful {
|
||||
return 0, errors.New("failed to get device length")
|
||||
}
|
||||
return uint(len(m.devices)), nil
|
||||
}
|
||||
|
||||
func (m *MockNVMLDriver) DeviceInfoByIndex(index uint) (*DeviceInfo, error) {
|
||||
if index >= uint(len(m.devices)) {
|
||||
return nil, errors.New("index is out of range")
|
||||
}
|
||||
if !m.deviceInfoByIndexCallSuccessful {
|
||||
return nil, errors.New("failed to get device info by index")
|
||||
}
|
||||
return m.devices[index], nil
|
||||
}
|
||||
|
||||
func (m *MockNVMLDriver) DeviceInfoAndStatusByIndex(index uint) (*DeviceInfo, *DeviceStatus, error) {
|
||||
if index >= uint(len(m.devices)) || index >= uint(len(m.deviceStatus)) {
|
||||
return nil, nil, errors.New("index is out of range")
|
||||
}
|
||||
if !m.deviceInfoAndStatusByIndexCallSuccessful {
|
||||
return nil, nil, errors.New("failed to get device info and status by index")
|
||||
}
|
||||
return m.devices[index], m.deviceStatus[index], nil
|
||||
}
|
||||
|
||||
func TestGetFingerprintDataFromNVML(t *testing.T) {
|
||||
for _, testCase := range []struct {
|
||||
Name string
|
||||
DriverConfiguration *MockNVMLDriver
|
||||
ExpectedError bool
|
||||
ExpectedResult *FingerprintData
|
||||
}{
|
||||
{
|
||||
Name: "fail on systemDriverCallSuccessful",
|
||||
ExpectedError: true,
|
||||
ExpectedResult: nil,
|
||||
DriverConfiguration: &MockNVMLDriver{
|
||||
systemDriverCallSuccessful: false,
|
||||
deviceCountCallSuccessful: true,
|
||||
deviceInfoByIndexCallSuccessful: true,
|
||||
},
|
||||
},
|
||||
{
|
||||
Name: "fail on deviceCountCallSuccessful",
|
||||
ExpectedError: true,
|
||||
ExpectedResult: nil,
|
||||
DriverConfiguration: &MockNVMLDriver{
|
||||
systemDriverCallSuccessful: true,
|
||||
deviceCountCallSuccessful: false,
|
||||
deviceInfoByIndexCallSuccessful: true,
|
||||
},
|
||||
},
|
||||
{
|
||||
Name: "fail on deviceInfoByIndexCall",
|
||||
ExpectedError: true,
|
||||
ExpectedResult: nil,
|
||||
DriverConfiguration: &MockNVMLDriver{
|
||||
systemDriverCallSuccessful: true,
|
||||
deviceCountCallSuccessful: true,
|
||||
deviceInfoByIndexCallSuccessful: false,
|
||||
devices: []*DeviceInfo{
|
||||
{
|
||||
UUID: "UUID1",
|
||||
Name: helper.StringToPtr("ModelName1"),
|
||||
MemoryMiB: helper.Uint64ToPtr(16),
|
||||
PCIBusID: "busId",
|
||||
PowerW: helper.UintToPtr(100),
|
||||
BAR1MiB: helper.Uint64ToPtr(100),
|
||||
PCIBandwidthMBPerS: helper.UintToPtr(100),
|
||||
CoresClockMHz: helper.UintToPtr(100),
|
||||
MemoryClockMHz: helper.UintToPtr(100),
|
||||
}, {
|
||||
UUID: "UUID2",
|
||||
Name: helper.StringToPtr("ModelName2"),
|
||||
MemoryMiB: helper.Uint64ToPtr(8),
|
||||
PCIBusID: "busId",
|
||||
PowerW: helper.UintToPtr(100),
|
||||
BAR1MiB: helper.Uint64ToPtr(100),
|
||||
PCIBandwidthMBPerS: helper.UintToPtr(100),
|
||||
CoresClockMHz: helper.UintToPtr(100),
|
||||
MemoryClockMHz: helper.UintToPtr(100),
|
||||
},
|
||||
},
|
||||
},
|
||||
},
|
||||
{
|
||||
Name: "successful outcome",
|
||||
ExpectedError: false,
|
||||
ExpectedResult: &FingerprintData{
|
||||
DriverVersion: "driverVersion",
|
||||
Devices: []*FingerprintDeviceData{
|
||||
{
|
||||
DeviceData: &DeviceData{
|
||||
DeviceName: helper.StringToPtr("ModelName1"),
|
||||
UUID: "UUID1",
|
||||
MemoryMiB: helper.Uint64ToPtr(16),
|
||||
PowerW: helper.UintToPtr(100),
|
||||
BAR1MiB: helper.Uint64ToPtr(100),
|
||||
},
|
||||
PCIBusID: "busId1",
|
||||
PCIBandwidthMBPerS: helper.UintToPtr(100),
|
||||
CoresClockMHz: helper.UintToPtr(100),
|
||||
MemoryClockMHz: helper.UintToPtr(100),
|
||||
DisplayState: "Enabled",
|
||||
PersistenceMode: "Enabled",
|
||||
}, {
|
||||
DeviceData: &DeviceData{
|
||||
DeviceName: helper.StringToPtr("ModelName2"),
|
||||
UUID: "UUID2",
|
||||
MemoryMiB: helper.Uint64ToPtr(8),
|
||||
PowerW: helper.UintToPtr(200),
|
||||
BAR1MiB: helper.Uint64ToPtr(200),
|
||||
},
|
||||
PCIBusID: "busId2",
|
||||
PCIBandwidthMBPerS: helper.UintToPtr(200),
|
||||
CoresClockMHz: helper.UintToPtr(200),
|
||||
MemoryClockMHz: helper.UintToPtr(200),
|
||||
DisplayState: "Enabled",
|
||||
PersistenceMode: "Enabled",
|
||||
},
|
||||
},
|
||||
},
|
||||
DriverConfiguration: &MockNVMLDriver{
|
||||
systemDriverCallSuccessful: true,
|
||||
deviceCountCallSuccessful: true,
|
||||
deviceInfoByIndexCallSuccessful: true,
|
||||
driverVersion: "driverVersion",
|
||||
devices: []*DeviceInfo{
|
||||
{
|
||||
UUID: "UUID1",
|
||||
Name: helper.StringToPtr("ModelName1"),
|
||||
MemoryMiB: helper.Uint64ToPtr(16),
|
||||
PCIBusID: "busId1",
|
||||
PowerW: helper.UintToPtr(100),
|
||||
BAR1MiB: helper.Uint64ToPtr(100),
|
||||
PCIBandwidthMBPerS: helper.UintToPtr(100),
|
||||
CoresClockMHz: helper.UintToPtr(100),
|
||||
MemoryClockMHz: helper.UintToPtr(100),
|
||||
DisplayState: "Enabled",
|
||||
PersistenceMode: "Enabled",
|
||||
}, {
|
||||
UUID: "UUID2",
|
||||
Name: helper.StringToPtr("ModelName2"),
|
||||
MemoryMiB: helper.Uint64ToPtr(8),
|
||||
PCIBusID: "busId2",
|
||||
PowerW: helper.UintToPtr(200),
|
||||
BAR1MiB: helper.Uint64ToPtr(200),
|
||||
PCIBandwidthMBPerS: helper.UintToPtr(200),
|
||||
CoresClockMHz: helper.UintToPtr(200),
|
||||
MemoryClockMHz: helper.UintToPtr(200),
|
||||
DisplayState: "Enabled",
|
||||
PersistenceMode: "Enabled",
|
||||
},
|
||||
},
|
||||
},
|
||||
},
|
||||
} {
|
||||
cli := nvmlClient{driver: testCase.DriverConfiguration}
|
||||
fingerprintData, err := cli.GetFingerprintData()
|
||||
if testCase.ExpectedError && err == nil {
|
||||
t.Errorf("case '%s' : expected Error, but didn't get one", testCase.Name)
|
||||
}
|
||||
if !testCase.ExpectedError && err != nil {
|
||||
t.Errorf("case '%s' : unexpected Error '%v'", testCase.Name, err)
|
||||
}
|
||||
require.New(t).Equal(testCase.ExpectedResult, fingerprintData)
|
||||
}
|
||||
}
|
||||
|
||||
func TestGetStatsDataFromNVML(t *testing.T) {
|
||||
for _, testCase := range []struct {
|
||||
Name string
|
||||
DriverConfiguration *MockNVMLDriver
|
||||
ExpectedError bool
|
||||
ExpectedResult []*StatsData
|
||||
}{
|
||||
{
|
||||
Name: "fail on deviceCountCallSuccessful",
|
||||
ExpectedError: true,
|
||||
ExpectedResult: nil,
|
||||
DriverConfiguration: &MockNVMLDriver{
|
||||
systemDriverCallSuccessful: true,
|
||||
deviceCountCallSuccessful: false,
|
||||
deviceInfoByIndexCallSuccessful: true,
|
||||
deviceInfoAndStatusByIndexCallSuccessful: true,
|
||||
},
|
||||
},
|
||||
{
|
||||
Name: "fail on DeviceInfoAndStatusByIndex call",
|
||||
ExpectedError: true,
|
||||
ExpectedResult: nil,
|
||||
DriverConfiguration: &MockNVMLDriver{
|
||||
systemDriverCallSuccessful: true,
|
||||
deviceCountCallSuccessful: true,
|
||||
deviceInfoAndStatusByIndexCallSuccessful: false,
|
||||
devices: []*DeviceInfo{
|
||||
{
|
||||
UUID: "UUID1",
|
||||
Name: helper.StringToPtr("ModelName1"),
|
||||
MemoryMiB: helper.Uint64ToPtr(16),
|
||||
PCIBusID: "busId1",
|
||||
PowerW: helper.UintToPtr(100),
|
||||
BAR1MiB: helper.Uint64ToPtr(100),
|
||||
PCIBandwidthMBPerS: helper.UintToPtr(100),
|
||||
CoresClockMHz: helper.UintToPtr(100),
|
||||
MemoryClockMHz: helper.UintToPtr(100),
|
||||
}, {
|
||||
UUID: "UUID2",
|
||||
Name: helper.StringToPtr("ModelName2"),
|
||||
MemoryMiB: helper.Uint64ToPtr(8),
|
||||
PCIBusID: "busId2",
|
||||
PowerW: helper.UintToPtr(200),
|
||||
BAR1MiB: helper.Uint64ToPtr(200),
|
||||
PCIBandwidthMBPerS: helper.UintToPtr(200),
|
||||
CoresClockMHz: helper.UintToPtr(200),
|
||||
MemoryClockMHz: helper.UintToPtr(200),
|
||||
},
|
||||
},
|
||||
deviceStatus: []*DeviceStatus{
|
||||
{
|
||||
TemperatureC: helper.UintToPtr(1),
|
||||
GPUUtilization: helper.UintToPtr(1),
|
||||
MemoryUtilization: helper.UintToPtr(1),
|
||||
EncoderUtilization: helper.UintToPtr(1),
|
||||
DecoderUtilization: helper.UintToPtr(1),
|
||||
UsedMemoryMiB: helper.Uint64ToPtr(1),
|
||||
ECCErrorsL1Cache: helper.Uint64ToPtr(1),
|
||||
ECCErrorsL2Cache: helper.Uint64ToPtr(1),
|
||||
ECCErrorsDevice: helper.Uint64ToPtr(1),
|
||||
PowerUsageW: helper.UintToPtr(1),
|
||||
BAR1UsedMiB: helper.Uint64ToPtr(1),
|
||||
},
|
||||
{
|
||||
TemperatureC: helper.UintToPtr(2),
|
||||
GPUUtilization: helper.UintToPtr(2),
|
||||
MemoryUtilization: helper.UintToPtr(2),
|
||||
EncoderUtilization: helper.UintToPtr(2),
|
||||
DecoderUtilization: helper.UintToPtr(2),
|
||||
UsedMemoryMiB: helper.Uint64ToPtr(2),
|
||||
ECCErrorsL1Cache: helper.Uint64ToPtr(2),
|
||||
ECCErrorsL2Cache: helper.Uint64ToPtr(2),
|
||||
ECCErrorsDevice: helper.Uint64ToPtr(2),
|
||||
PowerUsageW: helper.UintToPtr(2),
|
||||
BAR1UsedMiB: helper.Uint64ToPtr(2),
|
||||
},
|
||||
},
|
||||
},
|
||||
},
|
||||
{
|
||||
Name: "successful outcome",
|
||||
ExpectedError: false,
|
||||
ExpectedResult: []*StatsData{
|
||||
{
|
||||
DeviceData: &DeviceData{
|
||||
DeviceName: helper.StringToPtr("ModelName1"),
|
||||
UUID: "UUID1",
|
||||
MemoryMiB: helper.Uint64ToPtr(16),
|
||||
PowerW: helper.UintToPtr(100),
|
||||
BAR1MiB: helper.Uint64ToPtr(100),
|
||||
},
|
||||
TemperatureC: helper.UintToPtr(1),
|
||||
GPUUtilization: helper.UintToPtr(1),
|
||||
MemoryUtilization: helper.UintToPtr(1),
|
||||
EncoderUtilization: helper.UintToPtr(1),
|
||||
DecoderUtilization: helper.UintToPtr(1),
|
||||
UsedMemoryMiB: helper.Uint64ToPtr(1),
|
||||
ECCErrorsL1Cache: helper.Uint64ToPtr(1),
|
||||
ECCErrorsL2Cache: helper.Uint64ToPtr(1),
|
||||
ECCErrorsDevice: helper.Uint64ToPtr(1),
|
||||
PowerUsageW: helper.UintToPtr(1),
|
||||
BAR1UsedMiB: helper.Uint64ToPtr(1),
|
||||
},
|
||||
{
|
||||
DeviceData: &DeviceData{
|
||||
DeviceName: helper.StringToPtr("ModelName2"),
|
||||
UUID: "UUID2",
|
||||
MemoryMiB: helper.Uint64ToPtr(8),
|
||||
PowerW: helper.UintToPtr(200),
|
||||
BAR1MiB: helper.Uint64ToPtr(200),
|
||||
},
|
||||
TemperatureC: helper.UintToPtr(2),
|
||||
GPUUtilization: helper.UintToPtr(2),
|
||||
MemoryUtilization: helper.UintToPtr(2),
|
||||
EncoderUtilization: helper.UintToPtr(2),
|
||||
DecoderUtilization: helper.UintToPtr(2),
|
||||
UsedMemoryMiB: helper.Uint64ToPtr(2),
|
||||
ECCErrorsL1Cache: helper.Uint64ToPtr(2),
|
||||
ECCErrorsL2Cache: helper.Uint64ToPtr(2),
|
||||
ECCErrorsDevice: helper.Uint64ToPtr(2),
|
||||
PowerUsageW: helper.UintToPtr(2),
|
||||
BAR1UsedMiB: helper.Uint64ToPtr(2),
|
||||
},
|
||||
},
|
||||
DriverConfiguration: &MockNVMLDriver{
|
||||
deviceCountCallSuccessful: true,
|
||||
deviceInfoByIndexCallSuccessful: true,
|
||||
deviceInfoAndStatusByIndexCallSuccessful: true,
|
||||
devices: []*DeviceInfo{
|
||||
{
|
||||
UUID: "UUID1",
|
||||
Name: helper.StringToPtr("ModelName1"),
|
||||
MemoryMiB: helper.Uint64ToPtr(16),
|
||||
PCIBusID: "busId1",
|
||||
PowerW: helper.UintToPtr(100),
|
||||
BAR1MiB: helper.Uint64ToPtr(100),
|
||||
PCIBandwidthMBPerS: helper.UintToPtr(100),
|
||||
CoresClockMHz: helper.UintToPtr(100),
|
||||
MemoryClockMHz: helper.UintToPtr(100),
|
||||
}, {
|
||||
UUID: "UUID2",
|
||||
Name: helper.StringToPtr("ModelName2"),
|
||||
MemoryMiB: helper.Uint64ToPtr(8),
|
||||
PCIBusID: "busId2",
|
||||
PowerW: helper.UintToPtr(200),
|
||||
BAR1MiB: helper.Uint64ToPtr(200),
|
||||
PCIBandwidthMBPerS: helper.UintToPtr(200),
|
||||
CoresClockMHz: helper.UintToPtr(200),
|
||||
MemoryClockMHz: helper.UintToPtr(200),
|
||||
},
|
||||
},
|
||||
deviceStatus: []*DeviceStatus{
|
||||
{
|
||||
TemperatureC: helper.UintToPtr(1),
|
||||
GPUUtilization: helper.UintToPtr(1),
|
||||
MemoryUtilization: helper.UintToPtr(1),
|
||||
EncoderUtilization: helper.UintToPtr(1),
|
||||
DecoderUtilization: helper.UintToPtr(1),
|
||||
UsedMemoryMiB: helper.Uint64ToPtr(1),
|
||||
ECCErrorsL1Cache: helper.Uint64ToPtr(1),
|
||||
ECCErrorsL2Cache: helper.Uint64ToPtr(1),
|
||||
ECCErrorsDevice: helper.Uint64ToPtr(1),
|
||||
PowerUsageW: helper.UintToPtr(1),
|
||||
BAR1UsedMiB: helper.Uint64ToPtr(1),
|
||||
},
|
||||
{
|
||||
TemperatureC: helper.UintToPtr(2),
|
||||
GPUUtilization: helper.UintToPtr(2),
|
||||
MemoryUtilization: helper.UintToPtr(2),
|
||||
EncoderUtilization: helper.UintToPtr(2),
|
||||
DecoderUtilization: helper.UintToPtr(2),
|
||||
UsedMemoryMiB: helper.Uint64ToPtr(2),
|
||||
ECCErrorsL1Cache: helper.Uint64ToPtr(2),
|
||||
ECCErrorsL2Cache: helper.Uint64ToPtr(2),
|
||||
ECCErrorsDevice: helper.Uint64ToPtr(2),
|
||||
PowerUsageW: helper.UintToPtr(2),
|
||||
BAR1UsedMiB: helper.Uint64ToPtr(2),
|
||||
},
|
||||
},
|
||||
},
|
||||
},
|
||||
} {
|
||||
cli := nvmlClient{driver: testCase.DriverConfiguration}
|
||||
statsData, err := cli.GetStatsData()
|
||||
if testCase.ExpectedError && err == nil {
|
||||
t.Errorf("case '%s' : expected Error, but didn't get one", testCase.Name)
|
||||
}
|
||||
if !testCase.ExpectedError && err != nil {
|
||||
t.Errorf("case '%s' : unexpected Error '%v'", testCase.Name, err)
|
||||
}
|
||||
require.New(t).Equal(testCase.ExpectedResult, statsData)
|
||||
}
|
||||
}
|
||||
@@ -1,33 +0,0 @@
|
||||
// +build !linux
|
||||
|
||||
package nvml
|
||||
|
||||
// Initialize nvml library by locating nvml shared object file and calling ldopen
|
||||
func (n *nvmlDriver) Initialize() error {
|
||||
return UnavailableLib
|
||||
}
|
||||
|
||||
// Shutdown stops any further interaction with nvml
|
||||
func (n *nvmlDriver) Shutdown() error {
|
||||
return UnavailableLib
|
||||
}
|
||||
|
||||
// SystemDriverVersion returns installed driver version
|
||||
func (n *nvmlDriver) SystemDriverVersion() (string, error) {
|
||||
return "", UnavailableLib
|
||||
}
|
||||
|
||||
// DeviceCount reports number of available GPU devices
|
||||
func (n *nvmlDriver) DeviceCount() (uint, error) {
|
||||
return 0, UnavailableLib
|
||||
}
|
||||
|
||||
// DeviceInfoByIndex returns DeviceInfo for index GPU in system device list
|
||||
func (n *nvmlDriver) DeviceInfoByIndex(index uint) (*DeviceInfo, error) {
|
||||
return nil, UnavailableLib
|
||||
}
|
||||
|
||||
// DeviceInfoByIndex returns DeviceInfo and DeviceStatus for index GPU in system device list
|
||||
func (n *nvmlDriver) DeviceInfoAndStatusByIndex(index uint) (*DeviceInfo, *DeviceStatus, error) {
|
||||
return nil, nil, UnavailableLib
|
||||
}
|
||||
@@ -1,85 +0,0 @@
|
||||
package nvml
|
||||
|
||||
import (
|
||||
"github.com/NVIDIA/gpu-monitoring-tools/bindings/go/nvml"
|
||||
)
|
||||
|
||||
// Initialize nvml library by locating nvml shared object file and calling ldopen
|
||||
func (n *nvmlDriver) Initialize() error {
|
||||
return nvml.Init()
|
||||
}
|
||||
|
||||
// Shutdown stops any further interaction with nvml
|
||||
func (n *nvmlDriver) Shutdown() error {
|
||||
return nvml.Shutdown()
|
||||
}
|
||||
|
||||
// SystemDriverVersion returns installed driver version
|
||||
func (n *nvmlDriver) SystemDriverVersion() (string, error) {
|
||||
return nvml.GetDriverVersion()
|
||||
}
|
||||
|
||||
// DeviceCount reports number of available GPU devices
|
||||
func (n *nvmlDriver) DeviceCount() (uint, error) {
|
||||
return nvml.GetDeviceCount()
|
||||
}
|
||||
|
||||
// DeviceInfoByIndex returns DeviceInfo for index GPU in system device list
|
||||
func (n *nvmlDriver) DeviceInfoByIndex(index uint) (*DeviceInfo, error) {
|
||||
device, err := nvml.NewDevice(index)
|
||||
if err != nil {
|
||||
return nil, err
|
||||
}
|
||||
deviceMode, err := device.GetDeviceMode()
|
||||
if err != nil {
|
||||
return nil, err
|
||||
}
|
||||
return &DeviceInfo{
|
||||
UUID: device.UUID,
|
||||
Name: device.Model,
|
||||
MemoryMiB: device.Memory,
|
||||
PowerW: device.Power,
|
||||
BAR1MiB: device.PCI.BAR1,
|
||||
PCIBandwidthMBPerS: device.PCI.Bandwidth,
|
||||
PCIBusID: device.PCI.BusID,
|
||||
CoresClockMHz: device.Clocks.Cores,
|
||||
MemoryClockMHz: device.Clocks.Memory,
|
||||
DisplayState: deviceMode.DisplayInfo.Mode.String(),
|
||||
PersistenceMode: deviceMode.Persistence.String(),
|
||||
}, nil
|
||||
}
|
||||
|
||||
// DeviceInfoByIndex returns DeviceInfo and DeviceStatus for index GPU in system device list
|
||||
func (n *nvmlDriver) DeviceInfoAndStatusByIndex(index uint) (*DeviceInfo, *DeviceStatus, error) {
|
||||
device, err := nvml.NewDevice(index)
|
||||
if err != nil {
|
||||
return nil, nil, err
|
||||
}
|
||||
status, err := device.Status()
|
||||
if err != nil {
|
||||
return nil, nil, err
|
||||
}
|
||||
return &DeviceInfo{
|
||||
UUID: device.UUID,
|
||||
Name: device.Model,
|
||||
MemoryMiB: device.Memory,
|
||||
PowerW: device.Power,
|
||||
BAR1MiB: device.PCI.BAR1,
|
||||
PCIBandwidthMBPerS: device.PCI.Bandwidth,
|
||||
PCIBusID: device.PCI.BusID,
|
||||
CoresClockMHz: device.Clocks.Cores,
|
||||
MemoryClockMHz: device.Clocks.Memory,
|
||||
}, &DeviceStatus{
|
||||
TemperatureC: status.Temperature,
|
||||
GPUUtilization: status.Utilization.GPU,
|
||||
MemoryUtilization: status.Utilization.Memory,
|
||||
EncoderUtilization: status.Utilization.Encoder,
|
||||
DecoderUtilization: status.Utilization.Decoder,
|
||||
UsedMemoryMiB: status.Memory.Global.Used,
|
||||
ECCErrorsL1Cache: status.Memory.ECCErrors.L1Cache,
|
||||
ECCErrorsL2Cache: status.Memory.ECCErrors.L2Cache,
|
||||
ECCErrorsDevice: status.Memory.ECCErrors.Device,
|
||||
PowerUsageW: status.Power,
|
||||
BAR1UsedMiB: status.PCI.BAR1Used,
|
||||
}, nil
|
||||
}
|
||||
@@ -1,61 +0,0 @@
|
||||
package nvml
|
||||
|
||||
import "errors"
|
||||
|
||||
// UnavailableLib is the error reported when the NVML library could not be
// loaded.
var UnavailableLib = errors.New("could not load NVML library")
|
||||
|
||||
// nvmlDriver is the NvmlDriver implementation backed by the NVML library.
// It carries no state of its own. Callers must invoke Initialize before any
// other method.
type nvmlDriver struct {
}
|
||||
|
||||
// NvmlDriver represents set of methods to query nvml library
|
||||
type NvmlDriver interface {
|
||||
Initialize() error
|
||||
Shutdown() error
|
||||
SystemDriverVersion() (string, error)
|
||||
DeviceCount() (uint, error)
|
||||
DeviceInfoByIndex(uint) (*DeviceInfo, error)
|
||||
DeviceInfoAndStatusByIndex(uint) (*DeviceInfo, *DeviceStatus, error)
|
||||
}
|
||||
|
||||
// DeviceInfo holds the static description of one NVML device. It is the
// value returned by the NvmlDriver methods DeviceInfoByIndex and
// DeviceInfoAndStatusByIndex.
type DeviceInfo struct {
	// Fields that are guaranteed to be retrieved from nvml.
	UUID, PCIBusID, DisplayState, PersistenceMode string

	// Fields that may be nil after a call to nvml, because nvml was not able
	// to retrieve them for the specific nvidia card.
	Name                                              *string
	MemoryMiB                                         *uint64
	PowerW                                            *uint
	BAR1MiB                                           *uint64
	PCIBandwidthMBPerS, CoresClockMHz, MemoryClockMHz *uint
}
|
||||
|
||||
// DeviceStatus holds the current readings of one NVML device. It is the
// value returned by the NvmlDriver method DeviceInfoAndStatusByIndex.
//
// Any field may be nil after a call to nvml, because nvml was not able to
// retrieve it for the specific nvidia card.
type DeviceStatus struct {
	PowerUsageW, TemperatureC *uint

	// Utilization readings, in percent.
	GPUUtilization     *uint
	MemoryUtilization  *uint
	EncoderUtilization *uint
	DecoderUtilization *uint

	BAR1UsedMiB, UsedMemoryMiB *uint64

	ECCErrorsL1Cache, ECCErrorsL2Cache, ECCErrorsDevice *uint64
}
|
||||
@@ -1,325 +0,0 @@
|
||||
package nvidia
|
||||
|
||||
import (
|
||||
"context"
|
||||
"time"
|
||||
|
||||
"github.com/hashicorp/nomad/devices/gpu/nvidia/nvml"
|
||||
"github.com/hashicorp/nomad/helper"
|
||||
"github.com/hashicorp/nomad/plugins/device"
|
||||
"github.com/hashicorp/nomad/plugins/shared/structs"
|
||||
)
|
||||
|
||||
// Attribute names, units and descriptions used when reporting per-device
// statistics. Every statistic contributes three constants: <Stat>Attr is the
// attribute key, <Stat>Unit its unit and <Stat>Desc a human-readable
// description.
const (
	// Power usage.
	PowerUsageAttr = "Power usage"
	PowerUsageUnit = "W"
	PowerUsageDesc = "Power usage for this GPU in watts and its associated circuitry (e.g. memory) / Maximum GPU Power"

	// GPU utilization.
	GPUUtilizationAttr = "GPU utilization"
	GPUUtilizationUnit = "%"
	GPUUtilizationDesc = "Percent of time over the past sample period during which one or more kernels were executing on the GPU."

	// Memory utilization.
	MemoryUtilizationAttr = "Memory utilization"
	MemoryUtilizationUnit = "%"
	MemoryUtilizationDesc = "Percentage of bandwidth used during the past sample period"

	// Encoder utilization.
	EncoderUtilizationAttr = "Encoder utilization"
	EncoderUtilizationUnit = "%"
	EncoderUtilizationDesc = "Percent of time over the past sample period during which GPU Encoder was used"

	// Decoder utilization.
	DecoderUtilizationAttr = "Decoder utilization"
	DecoderUtilizationUnit = "%"
	DecoderUtilizationDesc = "Percent of time over the past sample period during which GPU Decoder was used"

	// Temperature, in degrees Celsius.
	TemperatureAttr = "Temperature"
	TemperatureUnit = "C"
	TemperatureDesc = "Temperature of the Unit"

	// Memory state, in mebibytes.
	MemoryStateAttr = "Memory state"
	MemoryStateUnit = "MiB"
	MemoryStateDesc = "UsedMemory / TotalMemory"

	// BAR1 buffer state, in mebibytes.
	BAR1StateAttr = "BAR1 buffer state"
	BAR1StateUnit = "MiB"
	BAR1StateDesc = "UsedBAR1 / TotalBAR1"

	// ECC error counters; the unit "#" denotes a number of errors.
	ECCErrorsL1CacheAttr = "ECC L1 errors"
	ECCErrorsL1CacheUnit = "#"
	ECCErrorsL1CacheDesc = "Requested L1Cache error counter for the device"

	ECCErrorsL2CacheAttr = "ECC L2 errors"
	ECCErrorsL2CacheUnit = "#"
	ECCErrorsL2CacheDesc = "Requested L2Cache error counter for the device"

	ECCErrorsDeviceAttr = "ECC memory errors"
	ECCErrorsDeviceUnit = "#"
	ECCErrorsDeviceDesc = "Requested memory error counter for the device"
)
|
||||
|
||||
// stats is the long running goroutine that streams device statistics
|
||||
func (d *NvidiaDevice) stats(ctx context.Context, stats chan<- *device.StatsResponse, interval time.Duration) {
|
||||
defer close(stats)
|
||||
|
||||
if d.initErr != nil {
|
||||
if d.initErr.Error() != nvml.UnavailableLib.Error() {
|
||||
d.logger.Error("exiting stats due to problems with NVML loading", "error", d.initErr)
|
||||
stats <- device.NewStatsError(d.initErr)
|
||||
}
|
||||
|
||||
return
|
||||
}
|
||||
|
||||
// Create a timer that will fire immediately for the first detection
|
||||
ticker := time.NewTimer(0)
|
||||
|
||||
for {
|
||||
select {
|
||||
case <-ctx.Done():
|
||||
return
|
||||
case <-ticker.C:
|
||||
ticker.Reset(interval)
|
||||
}
|
||||
|
||||
d.writeStatsToChannel(stats, time.Now())
|
||||
}
|
||||
}
|
||||
|
||||
// filterStatsByID accepts list of StatsData and set of IDs
|
||||
// this function would return entries from StatsData with IDs found in the set
|
||||
func filterStatsByID(stats []*nvml.StatsData, ids map[string]struct{}) []*nvml.StatsData {
|
||||
var filteredStats []*nvml.StatsData
|
||||
for _, statsItem := range stats {
|
||||
if _, ok := ids[statsItem.UUID]; ok {
|
||||
filteredStats = append(filteredStats, statsItem)
|
||||
}
|
||||
}
|
||||
return filteredStats
|
||||
}
|
||||
|
||||
// writeStatsToChannel collects StatsData from NVML backend, groups StatsData
|
||||
// by DeviceName attribute, populates DeviceGroupStats structure for every group
|
||||
// and sends data over provided channel
|
||||
func (d *NvidiaDevice) writeStatsToChannel(stats chan<- *device.StatsResponse, timestamp time.Time) {
|
||||
statsData, err := d.nvmlClient.GetStatsData()
|
||||
if err != nil {
|
||||
d.logger.Error("failed to get nvidia stats", "error", err)
|
||||
stats <- &device.StatsResponse{
|
||||
Error: err,
|
||||
}
|
||||
return
|
||||
}
|
||||
|
||||
// filter only stats from devices that are stored in NvidiaDevice struct
|
||||
d.deviceLock.RLock()
|
||||
statsData = filterStatsByID(statsData, d.devices)
|
||||
d.deviceLock.RUnlock()
|
||||
|
||||
// group stats by DeviceName struct field
|
||||
statsListByDeviceName := make(map[string][]*nvml.StatsData)
|
||||
for _, statsItem := range statsData {
|
||||
deviceName := statsItem.DeviceName
|
||||
if deviceName == nil {
|
||||
// nvml driver was not able to detect device name. This kind
|
||||
// of devices are placed to single group with 'notAvailable' name
|
||||
notAvailableCopy := notAvailable
|
||||
deviceName = ¬AvailableCopy
|
||||
}
|
||||
|
||||
statsListByDeviceName[*deviceName] = append(statsListByDeviceName[*deviceName], statsItem)
|
||||
}
|
||||
|
||||
// place data device.DeviceGroupStats struct for every group of stats
|
||||
deviceGroupsStats := make([]*device.DeviceGroupStats, 0, len(statsListByDeviceName))
|
||||
for groupName, groupStats := range statsListByDeviceName {
|
||||
deviceGroupsStats = append(deviceGroupsStats, statsForGroup(groupName, groupStats, timestamp))
|
||||
}
|
||||
|
||||
stats <- &device.StatsResponse{
|
||||
Groups: deviceGroupsStats,
|
||||
}
|
||||
}
|
||||
|
||||
func newNotAvailableDeviceStats(unit, desc string) *structs.StatValue {
|
||||
return &structs.StatValue{Unit: unit, Desc: desc, StringVal: helper.StringToPtr(notAvailable)}
|
||||
}
|
||||
|
||||
// statsForGroup is a helper function that populates device.DeviceGroupStats
|
||||
// for given groupName with groupStats list
|
||||
func statsForGroup(groupName string, groupStats []*nvml.StatsData, timestamp time.Time) *device.DeviceGroupStats {
|
||||
instanceStats := make(map[string]*device.DeviceStats)
|
||||
for _, statsItem := range groupStats {
|
||||
instanceStats[statsItem.UUID] = statsForItem(statsItem, timestamp)
|
||||
}
|
||||
|
||||
return &device.DeviceGroupStats{
|
||||
Vendor: vendor,
|
||||
Type: deviceType,
|
||||
Name: groupName,
|
||||
InstanceStats: instanceStats,
|
||||
}
|
||||
}
|
||||
|
||||
// statsForItem is a helper function that populates device.DeviceStats for given
|
||||
// nvml.StatsData
|
||||
func statsForItem(statsItem *nvml.StatsData, timestamp time.Time) *device.DeviceStats {
|
||||
// nvml.StatsData holds pointers to values that can be nil
|
||||
// In case they are nil return stats with 'notAvailable' constant
|
||||
var (
|
||||
powerUsageStat *structs.StatValue
|
||||
GPUUtilizationStat *structs.StatValue
|
||||
memoryUtilizationStat *structs.StatValue
|
||||
encoderUtilizationStat *structs.StatValue
|
||||
decoderUtilizationStat *structs.StatValue
|
||||
temperatureStat *structs.StatValue
|
||||
memoryStateStat *structs.StatValue
|
||||
BAR1StateStat *structs.StatValue
|
||||
ECCErrorsL1CacheStat *structs.StatValue
|
||||
ECCErrorsL2CacheStat *structs.StatValue
|
||||
ECCErrorsDeviceStat *structs.StatValue
|
||||
)
|
||||
|
||||
if statsItem.PowerUsageW == nil || statsItem.PowerW == nil {
|
||||
powerUsageStat = newNotAvailableDeviceStats(PowerUsageUnit, PowerUsageDesc)
|
||||
} else {
|
||||
powerUsageStat = &structs.StatValue{
|
||||
Unit: PowerUsageUnit,
|
||||
Desc: PowerUsageDesc,
|
||||
IntNumeratorVal: helper.Int64ToPtr(int64(*statsItem.PowerUsageW)),
|
||||
IntDenominatorVal: uintToInt64Ptr(statsItem.PowerW),
|
||||
}
|
||||
}
|
||||
|
||||
if statsItem.GPUUtilization == nil {
|
||||
GPUUtilizationStat = newNotAvailableDeviceStats(GPUUtilizationUnit, GPUUtilizationDesc)
|
||||
} else {
|
||||
GPUUtilizationStat = &structs.StatValue{
|
||||
Unit: GPUUtilizationUnit,
|
||||
Desc: GPUUtilizationDesc,
|
||||
IntNumeratorVal: uintToInt64Ptr(statsItem.GPUUtilization),
|
||||
}
|
||||
}
|
||||
|
||||
if statsItem.MemoryUtilization == nil {
|
||||
memoryUtilizationStat = newNotAvailableDeviceStats(MemoryUtilizationUnit, MemoryUtilizationDesc)
|
||||
} else {
|
||||
memoryUtilizationStat = &structs.StatValue{
|
||||
Unit: MemoryUtilizationUnit,
|
||||
Desc: MemoryUtilizationDesc,
|
||||
IntNumeratorVal: uintToInt64Ptr(statsItem.MemoryUtilization),
|
||||
}
|
||||
}
|
||||
|
||||
if statsItem.EncoderUtilization == nil {
|
||||
encoderUtilizationStat = newNotAvailableDeviceStats(EncoderUtilizationUnit, EncoderUtilizationDesc)
|
||||
} else {
|
||||
encoderUtilizationStat = &structs.StatValue{
|
||||
Unit: EncoderUtilizationUnit,
|
||||
Desc: EncoderUtilizationDesc,
|
||||
IntNumeratorVal: uintToInt64Ptr(statsItem.EncoderUtilization),
|
||||
}
|
||||
}
|
||||
|
||||
if statsItem.DecoderUtilization == nil {
|
||||
decoderUtilizationStat = newNotAvailableDeviceStats(DecoderUtilizationUnit, DecoderUtilizationDesc)
|
||||
} else {
|
||||
decoderUtilizationStat = &structs.StatValue{
|
||||
Unit: DecoderUtilizationUnit,
|
||||
Desc: DecoderUtilizationDesc,
|
||||
IntNumeratorVal: uintToInt64Ptr(statsItem.DecoderUtilization),
|
||||
}
|
||||
}
|
||||
|
||||
if statsItem.TemperatureC == nil {
|
||||
temperatureStat = newNotAvailableDeviceStats(TemperatureUnit, TemperatureDesc)
|
||||
} else {
|
||||
temperatureStat = &structs.StatValue{
|
||||
Unit: TemperatureUnit,
|
||||
Desc: TemperatureDesc,
|
||||
IntNumeratorVal: uintToInt64Ptr(statsItem.TemperatureC),
|
||||
}
|
||||
}
|
||||
|
||||
if statsItem.UsedMemoryMiB == nil || statsItem.MemoryMiB == nil {
|
||||
memoryStateStat = newNotAvailableDeviceStats(MemoryStateUnit, MemoryStateDesc)
|
||||
} else {
|
||||
memoryStateStat = &structs.StatValue{
|
||||
Unit: MemoryStateUnit,
|
||||
Desc: MemoryStateDesc,
|
||||
IntNumeratorVal: uint64ToInt64Ptr(statsItem.UsedMemoryMiB),
|
||||
IntDenominatorVal: uint64ToInt64Ptr(statsItem.MemoryMiB),
|
||||
}
|
||||
}
|
||||
|
||||
if statsItem.BAR1UsedMiB == nil || statsItem.BAR1MiB == nil {
|
||||
BAR1StateStat = newNotAvailableDeviceStats(BAR1StateUnit, BAR1StateDesc)
|
||||
} else {
|
||||
BAR1StateStat = &structs.StatValue{
|
||||
Unit: BAR1StateUnit,
|
||||
Desc: BAR1StateDesc,
|
||||
IntNumeratorVal: uint64ToInt64Ptr(statsItem.BAR1UsedMiB),
|
||||
IntDenominatorVal: uint64ToInt64Ptr(statsItem.BAR1MiB),
|
||||
}
|
||||
}
|
||||
|
||||
if statsItem.ECCErrorsL1Cache == nil {
|
||||
ECCErrorsL1CacheStat = newNotAvailableDeviceStats(ECCErrorsL1CacheUnit, ECCErrorsL1CacheDesc)
|
||||
} else {
|
||||
ECCErrorsL1CacheStat = &structs.StatValue{
|
||||
Unit: ECCErrorsL1CacheUnit,
|
||||
Desc: ECCErrorsL1CacheDesc,
|
||||
IntNumeratorVal: uint64ToInt64Ptr(statsItem.ECCErrorsL1Cache),
|
||||
}
|
||||
}
|
||||
|
||||
if statsItem.ECCErrorsL2Cache == nil {
|
||||
ECCErrorsL2CacheStat = newNotAvailableDeviceStats(ECCErrorsL2CacheUnit, ECCErrorsL2CacheDesc)
|
||||
} else {
|
||||
ECCErrorsL2CacheStat = &structs.StatValue{
|
||||
Unit: ECCErrorsL2CacheUnit,
|
||||
Desc: ECCErrorsL2CacheDesc,
|
||||
IntNumeratorVal: uint64ToInt64Ptr(statsItem.ECCErrorsL2Cache),
|
||||
}
|
||||
}
|
||||
|
||||
if statsItem.ECCErrorsDevice == nil {
|
||||
ECCErrorsDeviceStat = newNotAvailableDeviceStats(ECCErrorsDeviceUnit, ECCErrorsDeviceDesc)
|
||||
} else {
|
||||
ECCErrorsDeviceStat = &structs.StatValue{
|
||||
Unit: ECCErrorsDeviceUnit,
|
||||
Desc: ECCErrorsDeviceDesc,
|
||||
IntNumeratorVal: uint64ToInt64Ptr(statsItem.ECCErrorsDevice),
|
||||
}
|
||||
}
|
||||
return &device.DeviceStats{
|
||||
Summary: memoryStateStat,
|
||||
Stats: &structs.StatObject{
|
||||
Attributes: map[string]*structs.StatValue{
|
||||
PowerUsageAttr: powerUsageStat,
|
||||
GPUUtilizationAttr: GPUUtilizationStat,
|
||||
MemoryUtilizationAttr: memoryUtilizationStat,
|
||||
EncoderUtilizationAttr: encoderUtilizationStat,
|
||||
DecoderUtilizationAttr: decoderUtilizationStat,
|
||||
TemperatureAttr: temperatureStat,
|
||||
MemoryStateAttr: memoryStateStat,
|
||||
BAR1StateAttr: BAR1StateStat,
|
||||
ECCErrorsL1CacheAttr: ECCErrorsL1CacheStat,
|
||||
ECCErrorsL2CacheAttr: ECCErrorsL2CacheStat,
|
||||
ECCErrorsDeviceAttr: ECCErrorsDeviceStat,
|
||||
},
|
||||
},
|
||||
Timestamp: timestamp,
|
||||
}
|
||||
}
|
||||
|
||||
// uintToInt64Ptr converts an optional uint into an optional int64. A nil
// input yields nil; otherwise a pointer to a fresh int64 is returned.
//
// Values that do not fit in an int64 (possible where uint is 64 bits wide)
// are clamped to the largest int64 instead of wrapping to a negative number.
func uintToInt64Ptr(u *uint) *int64 {
	if u == nil {
		return nil
	}

	const maxInt64 = 1<<63 - 1

	v := int64(maxInt64)
	if uint64(*u) <= maxInt64 {
		v = int64(*u)
	}
	return &v
}
|
||||
|
||||
// uint64ToInt64Ptr converts an optional uint64 into an optional int64. A nil
// input yields nil; otherwise a pointer to a fresh int64 is returned.
//
// Values above the largest int64 are clamped to it instead of wrapping to a
// negative number.
func uint64ToInt64Ptr(u *uint64) *int64 {
	if u == nil {
		return nil
	}

	const maxInt64 = 1<<63 - 1

	v := int64(maxInt64)
	if *u <= maxInt64 {
		v = int64(*u)
	}
	return &v
}
|
||||
File diff suppressed because it is too large
Load Diff
2
go.mod
2
go.mod
@@ -19,7 +19,6 @@ require (
|
||||
github.com/Azure/go-autorest/autorest/azure/auth v0.5.1 // indirect
|
||||
github.com/LK4D4/joincontext v0.0.0-20171026170139-1724345da6d5
|
||||
github.com/Microsoft/go-winio v0.4.15-0.20200113171025-3fe6c5262873
|
||||
github.com/NVIDIA/gpu-monitoring-tools v0.0.0-20180829222009-86f2a9fac6c5
|
||||
github.com/NYTimes/gziphandler v1.0.1
|
||||
github.com/armon/circbuf v0.0.0-20150827004946-bbbad097214e
|
||||
github.com/armon/go-metrics v0.3.4
|
||||
@@ -32,6 +31,7 @@ require (
|
||||
github.com/coreos/go-iptables v0.4.3-0.20190724151750-969b135e941d
|
||||
github.com/coreos/go-semver v0.3.0
|
||||
github.com/cyphar/filepath-securejoin v0.2.3-0.20190205144030-7efe413b52e1 // indirect
|
||||
github.com/denverdino/aliyungo v0.0.0-20190125010748-a747050bb1ba // indirect
|
||||
github.com/docker/cli v0.0.0-20200303215952-eb310fca4956
|
||||
github.com/docker/distribution v2.7.1+incompatible
|
||||
github.com/docker/docker v17.12.0-ce-rc1.0.20200330121334-7f8b4b621b5d+incompatible
|
||||
|
||||
5
go.sum
5
go.sum
@@ -64,8 +64,6 @@ github.com/LK4D4/joincontext v0.0.0-20171026170139-1724345da6d5/go.mod h1:nxQPcN
|
||||
github.com/Microsoft/hcsshim v0.8.7/go.mod h1:OHd7sQqRFrYd3RmSgbgji+ctCwkbq2wbEYNSzOYtcBQ=
|
||||
github.com/Microsoft/hcsshim v0.8.9 h1:VrfodqvztU8YSOvygU+DN1BGaSGxmrNfqOv5oOuX2Bk=
|
||||
github.com/Microsoft/hcsshim v0.8.9/go.mod h1:5692vkUqntj1idxauYlpoINNKeqCiG6Sg38RRsjT5y8=
|
||||
github.com/NVIDIA/gpu-monitoring-tools v0.0.0-20180829222009-86f2a9fac6c5 h1:WLyvLAM0QfjAarRzRTG9EgT5McqGWNZMvqqSUSoyUUY=
|
||||
github.com/NVIDIA/gpu-monitoring-tools v0.0.0-20180829222009-86f2a9fac6c5/go.mod h1:nMOvShGpWaf0bXwXmeu4k+O4uziuaEI8pWzIj3BUrOA=
|
||||
github.com/NYTimes/gziphandler v1.0.0 h1:OswZCvpiFsNRCbeapdJxDuikAqVXTgV7XAht8S9olZo=
|
||||
github.com/NYTimes/gziphandler v1.0.0/go.mod h1:3wb06e3pkSAbeQ52E9H9iFoQsEEwGN64994WTCIhntQ=
|
||||
github.com/PuerkitoBio/purell v1.0.0/go.mod h1:c11w/QuzBsJSee3cPx9rAFu61PvFxuPbtSwDGJws/X0=
|
||||
@@ -182,8 +180,9 @@ github.com/cyphar/filepath-securejoin v0.2.3-0.20190205144030-7efe413b52e1/go.mo
|
||||
github.com/davecgh/go-spew v1.1.0/go.mod h1:J7Y8YcW2NihsgmVo/mv3lAwl/skON4iLHjSsI+c5H38=
|
||||
github.com/davecgh/go-spew v1.1.1 h1:vj9j/u1bqnvCEfJOwUhtlOARqs3+rkHYY13jYWTU97c=
|
||||
github.com/davecgh/go-spew v1.1.1/go.mod h1:J7Y8YcW2NihsgmVo/mv3lAwl/skON4iLHjSsI+c5H38=
|
||||
github.com/denverdino/aliyungo v0.0.0-20170926055100-d3308649c661 h1:lrWnAyy/F72MbxIxFUzKmcMCdt9Oi8RzpAxzTNQHD7o=
|
||||
github.com/denverdino/aliyungo v0.0.0-20170926055100-d3308649c661/go.mod h1:dV8lFg6daOBZbT6/BDGIz6Y3WFGn8juu6G+CQ6LHtl0=
|
||||
github.com/denverdino/aliyungo v0.0.0-20190125010748-a747050bb1ba h1:p6poVbjHDkKa+wtC8frBMwQtT3BmqGYBjzMwJ63tuR4=
|
||||
github.com/denverdino/aliyungo v0.0.0-20190125010748-a747050bb1ba/go.mod h1:dV8lFg6daOBZbT6/BDGIz6Y3WFGn8juu6G+CQ6LHtl0=
|
||||
github.com/dgrijalva/jwt-go v3.2.0+incompatible h1:7qlOGliEKZXTDg6OTjfoBKDXWrumCAMpl/TFQ4/5kLM=
|
||||
github.com/dgrijalva/jwt-go v3.2.0+incompatible/go.mod h1:E3ru+11k8xSBh+hMPgOLZmtrrCbhqsmaPHjLKYnJCaQ=
|
||||
github.com/digitalocean/godo v1.7.5/go.mod h1:h6faOIcZ8lWIwNQ+DN7b3CgX4Kwby5T+nbpNqkUIozU=
|
||||
|
||||
@@ -1,14 +0,0 @@
|
||||
// +build !nonvidia
|
||||
|
||||
package catalog
|
||||
|
||||
import (
|
||||
"github.com/hashicorp/nomad/devices/gpu/nvidia"
|
||||
)
|
||||
|
||||
// This file is where all builtin plugins should be registered in the catalog.
|
||||
// Plugins with build restrictions should be placed in the appropriate
|
||||
// register_XXX.go file.
|
||||
func init() {
|
||||
// Runs at package initialization and passes the nvidia package's PluginID
// and PluginConfig to Register. This file carries the `!nonvidia` build
// constraint, so the call is compiled out when that tag is set.
// NOTE(review): Register is defined elsewhere in this package; presumably it
// adds the plugin to the builtin catalog — confirm.
Register(nvidia.PluginID, nvidia.PluginConfig)
|
||||
}
|
||||
33
website/content/docs/devices/external/index.mdx
vendored
33
website/content/docs/devices/external/index.mdx
vendored
@@ -1,30 +1,29 @@
|
||||
---
|
||||
layout: docs
|
||||
page_title: 'Device Plugins: Community Supported'
|
||||
description: A list of community supported Device Plugins.
|
||||
page_title: 'Device Plugins: External'
|
||||
description: 'A list of external Device Plugins.'
|
||||
---
|
||||
|
||||
# Community Supported
|
||||
|
||||
If you have authored a device plugin that you believe will be useful to the
|
||||
broader Nomad community and you are committed to maintaining the plugin, please
|
||||
file a PR to add your plugin to this page.
|
||||
|
||||
## Device Plugins
|
||||
# External Device Plugins
|
||||
|
||||
Nomad has a plugin system for defining device plugins. External device
|
||||
plugins will have the same user experience as built-in device plugins.
|
||||
|
||||
Below is a list of community-supported device plugins you can use with Nomad:
|
||||
Below is a list of official external device plugins you can use with Nomad:
|
||||
|
||||
- [Nvidia][nvidia]
|
||||
|
||||
## Community Supported
|
||||
|
||||
If you have authored a device plugin that you believe will be useful to the
|
||||
broader Nomad community and you are committed to maintaining the plugin,
|
||||
please file a PR to add your plugin to this page. For details on authoring a
|
||||
device plugin, please refer to the [plugin authoring guide][plugin_guide].
|
||||
|
||||
Below is a list of community-supported device plugins you can use with Nomad:
|
||||
|
||||
- [USB][usb]
|
||||
|
||||
## Authoring Device Plugins
|
||||
|
||||
Nomad has a plugin system for defining device plugins. External device plugins
|
||||
will have the same user experience as built-in device plugins. For details on
|
||||
authoring a device plugin, please refer to the [plugin authoring
|
||||
guide][plugin_guide].
|
||||
|
||||
[plugin_guide]: /docs/internals/plugins
|
||||
[nvidia]: /docs/devices/external/nvidia
|
||||
[usb]: /docs/devices/external/usb
|
||||
|
||||
@@ -6,18 +6,13 @@ description: Device Plugins are used to expose devices to tasks in Nomad.
|
||||
|
||||
# Device Plugins
|
||||
|
||||
Device plugins are used to detect and make devices available to tasks in Nomad.
|
||||
Devices are physical hardware that exists on a node such as a GPU or an FPGA. By
|
||||
having extensible device plugins, Nomad has the flexibility to support a broad
|
||||
set of devices and allows the community to build additional device plugins as
|
||||
needed.
|
||||
Device plugins are used to detect and make devices available to tasks in
|
||||
Nomad. Devices are physical hardware that exists on a client node such as a
|
||||
GPU or an FPGA. By having extensible device plugins, Nomad has the flexibility
|
||||
to support a broad set of devices and allows the community to build additional
|
||||
device plugins as needed.
|
||||
|
||||
The list of supported device plugins is provided on the left of this page.
|
||||
Each device plugin documents its configuration and installation requirements,
|
||||
the attributes it fingerprints, and the environment variables it exposes to
|
||||
tasks.
|
||||
|
||||
For details on authoring a device plugin, please refer to the [plugin authoring
|
||||
guide][plugin_guide].
|
||||
|
||||
[plugin_guide]: /docs/internals/plugins
|
||||
|
||||
@@ -1442,16 +1442,16 @@
|
||||
"path": "devices"
|
||||
},
|
||||
{
|
||||
"title": "Nvidia",
|
||||
"path": "devices/nvidia"
|
||||
},
|
||||
{
|
||||
"title": "Community",
|
||||
"title": "External",
|
||||
"routes": [
|
||||
{
|
||||
"title": "Overview",
|
||||
"path": "devices/external"
|
||||
},
|
||||
{
|
||||
"title": "Nvidia",
|
||||
"path": "devices/external/nvidia"
|
||||
},
|
||||
{
|
||||
"title": "USB <sup>Beta</sup>",
|
||||
"path": "devices/external/usb"
|
||||
|
||||
Reference in New Issue
Block a user