Merge pull request #10796 from hashicorp/external-nvidia

devices: externalize nvidia device driver
This commit is contained in:
Michael Schurter
2021-09-29 15:52:45 -07:00
committed by GitHub
21 changed files with 30 additions and 6192 deletions

View File

@@ -625,13 +625,9 @@ workflows:
test_module: "api"
filters: *backend_test_branches_filter
enable_race_testing: true
- test-container:
name: "test-devices"
test_packages: "./devices/..."
filters: *backend_test_branches_filter
- test-machine:
name: "test-other"
exclude_packages: "./api|./client|./drivers/docker|./drivers/exec|./drivers/shared/executor|./nomad|./devices|./e2e"
exclude_packages: "./api|./client|./drivers/docker|./drivers/exec|./drivers/shared/executor|./nomad|./e2e"
filters: *backend_test_branches_filter
- test-machine:
name: "test-docker"

View File

@@ -1,21 +0,0 @@
This package provides an implementation of nvidia device plugin
# Behavior
The Nvidia device plugin uses NVML bindings to collect data about the available Nvidia devices and exposes them via the Fingerprint RPC. GPUs can be excluded from fingerprinting by setting the `ignored_gpu_ids` field. The plugin sends statistics for fingerprinted devices every `stats_period`.
# Config
The configuration should be passed via an HCL file that begins with a top level `config` stanza:
```
config {
ignored_gpu_ids = ["uuid1", "uuid2"]
fingerprint_period = "5s"
}
```
The valid configuration options are:
* `ignored_gpu_ids` (`list(string)`: `[]`): list of GPU UUID strings that should not be exposed to Nomad.
* `fingerprint_period` (`string`: `"1m"`): interval to repeat the fingerprint process to identify possible changes.

View File

@@ -1,20 +0,0 @@
package main
import (
"context"
log "github.com/hashicorp/go-hclog"
"github.com/hashicorp/nomad/devices/gpu/nvidia"
"github.com/hashicorp/nomad/plugins"
)
// main is the entry point of the standalone plugin binary. It hands the
// plugin factory to plugins.ServeCtx.
func main() {
	// Serve the plugin
	plugins.ServeCtx(factory)
}
// factory returns a new instance of the Nvidia GPU plugin.
//
// The context and logger are passed through unchanged to
// nvidia.NewNvidiaDevice.
func factory(ctx context.Context, log log.Logger) interface{} {
	return nvidia.NewNvidiaDevice(ctx, log)
}

View File

@@ -1,228 +0,0 @@
package nvidia
import (
"context"
"fmt"
"strings"
"sync"
"time"
log "github.com/hashicorp/go-hclog"
"github.com/hashicorp/nomad/devices/gpu/nvidia/nvml"
"github.com/hashicorp/nomad/helper/pluginutils/loader"
"github.com/hashicorp/nomad/plugins/base"
"github.com/hashicorp/nomad/plugins/device"
"github.com/hashicorp/nomad/plugins/shared/hclspec"
)
const (
	// pluginName is the name of the plugin. It is used for the plugin ID, the
	// plugin info response and the named logger.
	pluginName = "nvidia-gpu"

	// vendor is the vendor providing the devices
	vendor = "nvidia"

	// deviceType is the type of device being returned
	deviceType = device.DeviceTypeGPU

	// notAvailable value is returned to nomad server in case some properties were
	// undetected by nvml driver
	notAvailable = "N/A"

	// NvidiaVisibleDevices is the nvidia-container-runtime environment
	// variable that Reserve fills with the comma-separated reserved device IDs.
	NvidiaVisibleDevices = "NVIDIA_VISIBLE_DEVICES"
)
var (
	// PluginID is the nvidia plugin metadata registered in the plugin
	// catalog.
	PluginID = loader.PluginID{
		Name:       pluginName,
		PluginType: base.PluginTypeDevice,
	}

	// PluginConfig is the nvidia factory function registered in the
	// plugin catalog.
	PluginConfig = &loader.InternalPluginConfig{
		Factory: func(ctx context.Context, l log.Logger) interface{} { return NewNvidiaDevice(ctx, l) },
	}

	// pluginInfo describes the plugin; it is the value returned by PluginInfo.
	pluginInfo = &base.PluginInfoResponse{
		Type:              base.PluginTypeDevice,
		PluginApiVersions: []string{device.ApiVersion010},
		PluginVersion:     "0.1.0",
		Name:              pluginName,
	}

	// configSpec is the specification of the plugin's configuration. All three
	// attributes are optional and carry defaults: enabled=true,
	// ignored_gpu_ids=[] and fingerprint_period="1m".
	configSpec = hclspec.NewObject(map[string]*hclspec.Spec{
		"enabled": hclspec.NewDefault(
			hclspec.NewAttr("enabled", "bool", false),
			hclspec.NewLiteral("true"),
		),
		"ignored_gpu_ids": hclspec.NewDefault(
			hclspec.NewAttr("ignored_gpu_ids", "list(string)", false),
			hclspec.NewLiteral("[]"),
		),
		"fingerprint_period": hclspec.NewDefault(
			hclspec.NewAttr("fingerprint_period", "string", false),
			hclspec.NewLiteral("\"1m\""),
		),
	})
)
// Config contains configuration information for the plugin. It is the decode
// target for the msgpack-encoded plugin configuration in SetConfig.
type Config struct {
	// Enabled toggles the plugin; Fingerprint, Stats and non-empty Reserve
	// calls return device.ErrPluginDisabled when it is false.
	Enabled bool `codec:"enabled"`
	// IgnoredGPUIDs lists GPU UUIDs that are excluded from fingerprinting.
	IgnoredGPUIDs []string `codec:"ignored_gpu_ids"`
	// FingerprintPeriod is a duration string parsed with time.ParseDuration.
	FingerprintPeriod string `codec:"fingerprint_period"`
}
// NvidiaDevice contains all plugin specific data
type NvidiaDevice struct {
	// enabled indicates whether the plugin should be enabled
	enabled bool

	// nvmlClient is used to get data from nvidia
	nvmlClient nvml.NvmlClient

	// initErr holds an error retrieved during
	// nvmlClient initialization
	initErr error

	// ignoredGPUIDs is a set of UUIDs that would not be exposed to nomad
	ignoredGPUIDs map[string]struct{}

	// fingerprintPeriod is how often we should call nvml to get list of devices
	fingerprintPeriod time.Duration

	// devices is the set of detected eligible devices
	devices map[string]struct{}
	// deviceLock guards devices: Reserve reads it under the read lock and
	// fingerprintChanged replaces it under the write lock.
	deviceLock sync.RWMutex

	// logger is the plugin-scoped logger.
	logger log.Logger
}
// NewNvidiaDevice returns a new nvidia device plugin.
//
// The NVML client is created eagerly. A failure to create it is not fatal:
// the error is kept in initErr so that later calls can report it, and it is
// logged unless it is the nvml.UnavailableLib error.
func NewNvidiaDevice(_ context.Context, log log.Logger) *NvidiaDevice {
	logger := log.Named(pluginName)

	client, err := nvml.NewNvmlClient()
	if err != nil && err.Error() != nvml.UnavailableLib.Error() {
		logger.Error("unable to initialize Nvidia driver", "reason", err)
	}

	d := &NvidiaDevice{
		logger:        logger,
		devices:       make(map[string]struct{}),
		ignoredGPUIDs: make(map[string]struct{}),
		initErr:       err,
	}

	// Only store the client when initialization succeeded. NewNvmlClient
	// returns a concrete pointer type, so assigning its nil result to the
	// interface-typed field would yield a non-nil interface wrapping a nil
	// pointer, and a later `d.nvmlClient != nil` check would wrongly pass.
	if err == nil {
		d.nvmlClient = client
	}
	return d
}
// PluginInfo returns information describing the plugin. The response is the
// shared package-level pluginInfo value and the error is always nil.
func (d *NvidiaDevice) PluginInfo() (*base.PluginInfoResponse, error) {
	return pluginInfo, nil
}
// ConfigSchema returns the plugin's configuration schema. The spec is the
// shared package-level configSpec value and the error is always nil.
func (d *NvidiaDevice) ConfigSchema() (*hclspec.Spec, error) {
	return configSpec, nil
}
// SetConfig is used to set the configuration of the plugin.
//
// The configuration is decoded and validated completely before any plugin
// state is modified, so a failing call leaves the previous configuration in
// place. The ignored-GPU set is rebuilt on every call: IDs dropped from the
// configuration stop being ignored, and a device whose ignoredGPUIDs map was
// never initialized is handled without panicking.
//
// It returns the decode error, or an error describing an unparsable
// fingerprint period.
func (d *NvidiaDevice) SetConfig(cfg *base.Config) error {
	var config Config
	if len(cfg.PluginConfig) != 0 {
		if err := base.MsgPackDecode(cfg.PluginConfig, &config); err != nil {
			return err
		}
	}

	period, err := time.ParseDuration(config.FingerprintPeriod)
	if err != nil {
		return fmt.Errorf("failed to parse fingerprint period %q: %v", config.FingerprintPeriod, err)
	}

	ignored := make(map[string]struct{}, len(config.IgnoredGPUIDs))
	for _, id := range config.IgnoredGPUIDs {
		ignored[id] = struct{}{}
	}

	// Everything validated: apply the new configuration in one go.
	d.enabled = config.Enabled
	d.ignoredGPUIDs = ignored
	d.fingerprintPeriod = period
	return nil
}
// Fingerprint starts the background fingerprinting goroutine and returns the
// channel on which it reports detected devices. A disabled plugin returns
// device.ErrPluginDisabled without starting anything.
func (d *NvidiaDevice) Fingerprint(ctx context.Context) (<-chan *device.FingerprintResponse, error) {
	if !d.enabled {
		return nil, device.ErrPluginDisabled
	}

	// Unbuffered: the goroutine blocks until the consumer reads each response.
	responses := make(chan *device.FingerprintResponse)
	go d.fingerprint(ctx, responses)

	return responses, nil
}
// reservationError reports the requested device IDs that are not present in
// the plugin's current device set.
type reservationError struct {
	notExistingIDs []string
}

// Error implements the error interface, listing the unknown IDs separated by
// commas.
func (r *reservationError) Error() string {
	unknown := strings.Join(r.notExistingIDs, ",")
	return fmt.Sprintf("unknown device IDs: %s", unknown)
}
// Reserve returns information on how to mount the given devices.
//
// The nomad server is assumed to be responsible for the correctness of GPU
// allocations (for example, never handing out the same GPU twice). An empty
// request yields an empty reservation even when the plugin is disabled; a
// non-empty request on a disabled plugin fails with device.ErrPluginDisabled.
func (d *NvidiaDevice) Reserve(deviceIDs []string) (*device.ContainerReservation, error) {
	switch {
	case len(deviceIDs) == 0:
		return &device.ContainerReservation{}, nil
	case !d.enabled:
		return nil, device.ErrPluginDisabled
	}

	// Fingerprinting runs asynchronously, so a GPU that was reported earlier
	// may have disappeared by the time the server asks to reserve it:
	//
	//  1 - fingerprint reports that GPU "1" is present
	//  2 - simultaneously:
	//      a) the server decides to allocate GPU "1"
	//      b) a fingerprint run finds GPU "1" is gone
	//
	// d.devices always holds the latest fingerprinted IDs, so any requested
	// ID missing from it is rejected.
	var unknown []string
	d.deviceLock.RLock()
	for _, id := range deviceIDs {
		if _, known := d.devices[id]; known {
			continue
		}
		unknown = append(unknown, id)
	}
	d.deviceLock.RUnlock()

	if len(unknown) != 0 {
		return nil, &reservationError{unknown}
	}

	envs := map[string]string{
		NvidiaVisibleDevices: strings.Join(deviceIDs, ","),
	}
	return &device.ContainerReservation{Envs: envs}, nil
}
// Stats starts the background statistics goroutine and returns the channel on
// which it reports device statistics, collected at the given interval. A
// disabled plugin returns device.ErrPluginDisabled without starting anything.
func (d *NvidiaDevice) Stats(ctx context.Context, interval time.Duration) (<-chan *device.StatsResponse, error) {
	if !d.enabled {
		return nil, device.ErrPluginDisabled
	}

	responses := make(chan *device.StatsResponse)
	go d.stats(ctx, responses, interval)

	return responses, nil
}

View File

@@ -1,140 +0,0 @@
package nvidia
import (
"testing"
hclog "github.com/hashicorp/go-hclog"
"github.com/hashicorp/nomad/devices/gpu/nvidia/nvml"
"github.com/hashicorp/nomad/plugins/device"
"github.com/stretchr/testify/require"
)
// MockNvmlClient is a canned nvml client for tests: each method returns the
// response/error pair configured on the struct.
type MockNvmlClient struct {
	FingerprintError            error
	FingerprintResponseReturned *nvml.FingerprintData
	StatsError                  error
	StatsResponseReturned       []*nvml.StatsData
}

// GetFingerprintData returns the configured fingerprint response and error.
func (m *MockNvmlClient) GetFingerprintData() (*nvml.FingerprintData, error) {
	return m.FingerprintResponseReturned, m.FingerprintError
}

// GetStatsData returns the configured stats response and error.
func (m *MockNvmlClient) GetStatsData() ([]*nvml.StatsData, error) {
	return m.StatsResponseReturned, m.StatsError
}
// TestReserve checks Reserve against devices with different known-ID sets:
// unknown IDs are rejected with a reservationError, fully known requests
// yield the NVIDIA_VISIBLE_DEVICES env, empty requests yield an empty
// reservation, and a disabled device returns ErrPluginDisabled.
func TestReserve(t *testing.T) {
	allIDs := func() []string {
		return []string{
			"UUID1",
			"UUID2",
			"UUID3",
		}
	}
	allDevices := func() map[string]struct{} {
		return map[string]struct{}{
			"UUID1": {},
			"UUID2": {},
			"UUID3": {},
		}
	}

	testCases := []struct {
		Name                string
		ExpectedReservation *device.ContainerReservation
		ExpectedError       error
		Device              *NvidiaDevice
		RequestedIDs        []string
	}{
		{
			Name:                "All RequestedIDs are not managed by Device",
			ExpectedReservation: nil,
			ExpectedError:       &reservationError{allIDs()},
			RequestedIDs:        allIDs(),
			Device: &NvidiaDevice{
				logger:  hclog.NewNullLogger(),
				enabled: true,
			},
		},
		{
			Name:                "Some RequestedIDs are not managed by Device",
			ExpectedReservation: nil,
			ExpectedError: &reservationError{[]string{
				"UUID1",
				"UUID2",
			}},
			RequestedIDs: allIDs(),
			Device: &NvidiaDevice{
				devices: map[string]struct{}{
					"UUID3": {},
				},
				logger:  hclog.NewNullLogger(),
				enabled: true,
			},
		},
		{
			Name: "All RequestedIDs are managed by Device",
			ExpectedReservation: &device.ContainerReservation{
				Envs: map[string]string{
					NvidiaVisibleDevices: "UUID1,UUID2,UUID3",
				},
			},
			ExpectedError: nil,
			RequestedIDs:  allIDs(),
			Device: &NvidiaDevice{
				devices: allDevices(),
				logger:  hclog.NewNullLogger(),
				enabled: true,
			},
		},
		{
			Name:                "No IDs requested",
			ExpectedReservation: &device.ContainerReservation{},
			ExpectedError:       nil,
			RequestedIDs:        nil,
			Device: &NvidiaDevice{
				devices: allDevices(),
				logger:  hclog.NewNullLogger(),
				enabled: true,
			},
		},
		{
			Name:                "Device is disabled",
			ExpectedReservation: nil,
			ExpectedError:       device.ErrPluginDisabled,
			RequestedIDs:        allIDs(),
			Device: &NvidiaDevice{
				devices: allDevices(),
				logger:  hclog.NewNullLogger(),
				enabled: false,
			},
		},
	}

	for _, tc := range testCases {
		t.Run(tc.Name, func(t *testing.T) {
			gotReservation, gotErr := tc.Device.Reserve(tc.RequestedIDs)
			require.Equal(t, tc.ExpectedReservation, gotReservation)
			require.Equal(t, tc.ExpectedError, gotErr)
		})
	}
}

View File

@@ -1,229 +0,0 @@
package nvidia
import (
"context"
"time"
"github.com/hashicorp/nomad/devices/gpu/nvidia/nvml"
"github.com/hashicorp/nomad/helper"
"github.com/hashicorp/nomad/plugins/device"
"github.com/hashicorp/nomad/plugins/shared/structs"
)
const (
	// Attribute names used as keys in the fingerprint attribute map. The
	// numeric attributes are reported with the unit noted beside them.
	MemoryAttr          = "memory"           // MiB
	PowerAttr           = "power"            // W
	BAR1Attr            = "bar1"             // MiB
	DriverVersionAttr   = "driver_version"   // string, shared by every group
	CoresClockAttr      = "cores_clock"      // MHz
	MemoryClockAttr     = "memory_clock"     // MHz
	PCIBandwidthAttr    = "pci_bandwidth"    // MB/s
	DisplayStateAttr    = "display_state"    // string
	PersistenceModeAttr = "persistence_mode" // string
)
// fingerprint is the long running goroutine that detects hardware.
//
// It owns the devices channel and closes it on exit. If NVML failed to
// initialize, it reports the error once (unless it is nvml.UnavailableLib)
// and returns. Otherwise it fingerprints immediately and then every
// fingerprintPeriod until ctx is cancelled.
func (d *NvidiaDevice) fingerprint(ctx context.Context, devices chan<- *device.FingerprintResponse) {
	defer close(devices)

	if d.initErr != nil {
		if d.initErr.Error() != nvml.UnavailableLib.Error() {
			d.logger.Error("exiting fingerprinting due to problems with NVML loading", "error", d.initErr)
			devices <- device.NewFingerprintError(d.initErr)
		}

		// Just close the channel to let server know that there are no working
		// Nvidia GPU units
		return
	}

	// Create a timer that will fire immediately for the first detection.
	// Stop it on exit so a pending timer is released as soon as the
	// goroutine returns instead of lingering until it would have fired.
	timer := time.NewTimer(0)
	defer timer.Stop()

	for {
		select {
		case <-ctx.Done():
			return
		case <-timer.C:
			// The channel has just been drained, so Reset is safe here.
			timer.Reset(d.fingerprintPeriod)
		}
		d.writeFingerprintToChannel(devices)
	}
}
// writeFingerprintToChannel queries nvml once and, if the set of eligible
// devices changed since the previous run, sends a fingerprint response on the
// channel. A failed nvml query is logged and sent as a fingerprint error.
func (d *NvidiaDevice) writeFingerprintToChannel(devices chan<- *device.FingerprintResponse) {
	data, err := d.nvmlClient.GetFingerprintData()
	if err != nil {
		d.logger.Error("failed to get fingerprint nvidia devices", "error", err)
		devices <- device.NewFingerprintError(err)
		return
	}

	// Drop the devices the operator asked to ignore.
	eligible := ignoreFingerprintedDevices(data.Devices, d.ignoredGPUIDs)

	// Nothing to report unless a device appeared or disappeared.
	if !d.fingerprintChanged(eligible) {
		return
	}

	commonAttributes := map[string]*structs.Attribute{
		DriverVersionAttr: {
			String: helper.StringToPtr(data.DriverVersion),
		},
	}

	// Group the devices by name. Devices whose name nvml could not detect
	// share a single group named notAvailable.
	byName := make(map[string][]*nvml.FingerprintDeviceData)
	for _, dev := range eligible {
		name := notAvailable
		if dev.DeviceName != nil {
			name = *dev.DeviceName
		}
		byName[name] = append(byName[name], dev)
	}

	// Build one device group per name and send them in a single response.
	groups := make([]*device.DeviceGroup, 0, len(byName))
	for name, members := range byName {
		groups = append(groups, deviceGroupFromFingerprintData(name, members, commonAttributes))
	}
	devices <- device.NewFingerprint(groups...)
}
// ignoreFingerprintedDevices returns the entries of deviceData whose UUID is
// not in ignoredGPUIDs, preserving their order. The result is nil when no
// device remains.
func ignoreFingerprintedDevices(deviceData []*nvml.FingerprintDeviceData, ignoredGPUIDs map[string]struct{}) []*nvml.FingerprintDeviceData {
	var kept []*nvml.FingerprintDeviceData
	for _, dev := range deviceData {
		if _, skip := ignoredGPUIDs[dev.UUID]; skip {
			continue
		}
		kept = append(kept, dev)
	}
	return kept
}
// fingerprintChanged reports whether the UUID set of allDevices differs from
// the set recorded by the previous fingerprint run, i.e. whether a device
// appeared or disappeared. As a side effect it replaces the device set on
// NvidiaDevice with the latest data, under the write lock.
func (d *NvidiaDevice) fingerprintChanged(allDevices []*nvml.FingerprintDeviceData) bool {
	d.deviceLock.Lock()
	defer d.deviceLock.Unlock()

	changed := false

	// Build the new set while looking for devices not seen before.
	latest := make(map[string]struct{})
	for _, dev := range allDevices {
		if _, known := d.devices[dev.UUID]; !known {
			changed = true
		}
		latest[dev.UUID] = struct{}{}
	}

	// Look for previously seen devices that are now gone.
	for id := range d.devices {
		if _, present := latest[id]; !present {
			changed = true
		}
	}

	d.devices = latest
	return changed
}
// deviceGroupFromFingerprintData builds the device group named groupName from
// deviceList. The group attributes are taken from the first device and then
// extended (or overridden) with commonAttributes. It returns nil for an empty
// deviceList, since a group without devices makes no sense.
func deviceGroupFromFingerprintData(groupName string, deviceList []*nvml.FingerprintDeviceData, commonAttributes map[string]*structs.Attribute) *device.DeviceGroup {
	if len(deviceList) == 0 {
		return nil
	}

	members := make([]*device.Device, 0, len(deviceList))
	for _, dev := range deviceList {
		members = append(members, &device.Device{
			ID: dev.UUID,
			// all fingerprinted devices are "healthy" for now
			// to get real health data -> dcgm bindings should be used
			Healthy: true,
			HwLocality: &device.DeviceLocality{
				PciBusID: dev.PCIBusID,
			},
		})
	}

	// Devices sharing a DeviceName are assumed to share attributes such as
	// memory size, power and BAR1 memory, so the first one is representative.
	attrs := attributesFromFingerprintDeviceData(deviceList[0])
	for key, value := range commonAttributes {
		attrs[key] = value
	}

	return &device.DeviceGroup{
		Vendor:     vendor,
		Type:       deviceType,
		Name:       groupName,
		Devices:    members,
		Attributes: attrs,
	}
}
// attributesFromFingerprintDeviceData converts an nvml.FingerprintDeviceData
// into the device.DeviceGroup.Attributes format
// (map[string]*structs.Attribute). Display state and persistence mode are
// always present; each numeric attribute is added only when nvml reported a
// value, so nil pointers in the input are skipped.
func attributesFromFingerprintDeviceData(d *nvml.FingerprintDeviceData) map[string]*structs.Attribute {
	result := map[string]*structs.Attribute{
		DisplayStateAttr:    {String: helper.StringToPtr(d.DisplayState)},
		PersistenceModeAttr: {String: helper.StringToPtr(d.PersistenceMode)},
	}

	if v := d.MemoryMiB; v != nil {
		result[MemoryAttr] = &structs.Attribute{
			Int:  helper.Int64ToPtr(int64(*v)),
			Unit: structs.UnitMiB,
		}
	}

	if v := d.PowerW; v != nil {
		result[PowerAttr] = &structs.Attribute{
			Int:  helper.Int64ToPtr(int64(*v)),
			Unit: structs.UnitW,
		}
	}

	if v := d.BAR1MiB; v != nil {
		result[BAR1Attr] = &structs.Attribute{
			Int:  helper.Int64ToPtr(int64(*v)),
			Unit: structs.UnitMiB,
		}
	}

	if v := d.CoresClockMHz; v != nil {
		result[CoresClockAttr] = &structs.Attribute{
			Int:  helper.Int64ToPtr(int64(*v)),
			Unit: structs.UnitMHz,
		}
	}

	if v := d.MemoryClockMHz; v != nil {
		result[MemoryClockAttr] = &structs.Attribute{
			Int:  helper.Int64ToPtr(int64(*v)),
			Unit: structs.UnitMHz,
		}
	}

	if v := d.PCIBandwidthMBPerS; v != nil {
		result[PCIBandwidthAttr] = &structs.Attribute{
			Int:  helper.Int64ToPtr(int64(*v)),
			Unit: structs.UnitMBPerS,
		}
	}

	return result
}

File diff suppressed because it is too large Load Diff

View File

@@ -1,194 +0,0 @@
package nvml
import (
"fmt"
)
// DeviceData represents common fields for Nvidia device. It is embedded in
// both FingerprintDeviceData and StatsData.
type DeviceData struct {
	// UUID identifies the device.
	UUID string
	// The pointer fields below are copied as-is from the driver's device
	// info. Consumers nil-check them, presumably because the driver may be
	// unable to report a value.
	DeviceName *string
	MemoryMiB  *uint64
	PowerW     *uint
	BAR1MiB    *uint64
}
// FingerprintDeviceData is a superset of DeviceData
// it describes device specific fields returned from
// nvml queries during fingerprinting call
type FingerprintDeviceData struct {
	*DeviceData
	// Pointer fields are copied as-is from the driver's device info and may
	// be nil.
	PCIBandwidthMBPerS *uint
	CoresClockMHz      *uint
	MemoryClockMHz     *uint
	DisplayState       string
	PersistenceMode    string
	PCIBusID           string
}
// FingerprintData represents the attributes of the driver and of the devices
// it exposes, as collected by a single GetFingerprintData call.
type FingerprintData struct {
	// Devices holds one entry per device index reported by the driver.
	Devices []*FingerprintDeviceData
	// DriverVersion is the system driver version string.
	DriverVersion string
}
// StatsData is a superset of DeviceData
// it represents statistics data returned for every Nvidia device
type StatsData struct {
	*DeviceData
	// The fields below are copied as-is from the driver's device status;
	// being pointers, they may be nil.
	PowerUsageW        *uint
	GPUUtilization     *uint
	MemoryUtilization  *uint
	EncoderUtilization *uint
	DecoderUtilization *uint
	TemperatureC       *uint
	UsedMemoryMiB      *uint64
	BAR1UsedMiB        *uint64
	ECCErrorsL1Cache   *uint64
	ECCErrorsL2Cache   *uint64
	ECCErrorsDevice    *uint64
}
// NvmlClient describes how users would use nvml library
type NvmlClient interface {
	// GetFingerprintData returns the driver version and per-device
	// fingerprint attributes.
	GetFingerprintData() (*FingerprintData, error)
	// GetStatsData returns one statistics entry per device.
	GetStatsData() ([]*StatsData, error)
}
// nvmlClient implements NvmlClient
// Users of this lib are expected to use this struct via NewNvmlClient func
type nvmlClient struct {
	// driver performs the actual nvml queries.
	driver NvmlDriver
}
// NewNvmlClient creates an nvmlClient backed by the real NvmlDriver
// implementation and initializes that driver. If initialization fails, the
// error is returned together with a nil client.
func NewNvmlClient() (*nvmlClient, error) {
	drv := &nvmlDriver{}
	if err := drv.Initialize(); err != nil {
		return nil, err
	}
	return &nvmlClient{driver: drv}, nil
}
// GetFingerprintData returns FingerprintData for available Nvidia devices.
//
// It queries the driver version and the device count, then the device info
// for every index. The first failing driver call aborts the whole operation:
// a nil result is returned together with an error that wraps the driver error
// and names the failing call.
func (c *nvmlClient) GetFingerprintData() (*FingerprintData, error) {
	/*
		nvml fields to be fingerprinted # nvml_library_call
		1  - Driver Version      # nvmlSystemGetDriverVersion
		2  - Product Name        # nvmlDeviceGetName
		3  - GPU UUID            # nvmlDeviceGetUUID
		4  - Total Memory        # nvmlDeviceGetMemoryInfo
		5  - Power               # nvmlDeviceGetPowerManagementLimit
		6  - PCIBusID            # nvmlDeviceGetPciInfo
		7  - BAR1 Memory         # nvmlDeviceGetBAR1MemoryInfo
		8  - PCI Bandwidth
		9  - Memory, Cores Clock # nvmlDeviceGetMaxClockInfo
		10 - Display Mode        # nvmlDeviceGetDisplayMode
		11 - Persistence Mode    # nvmlDeviceGetPersistenceMode
	*/

	// The receiver is assumed to come from NewNvmlClient, which initializes
	// the driver; this method performs no initialization of its own.
	driverVersion, err := c.driver.SystemDriverVersion()
	if err != nil {
		return nil, fmt.Errorf("nvidia nvml SystemDriverVersion() error: %w", err)
	}

	numDevices, err := c.driver.DeviceCount()
	if err != nil {
		return nil, fmt.Errorf("nvidia nvml DeviceCount() error: %w", err)
	}

	allNvidiaGPUResources := make([]*FingerprintDeviceData, numDevices)
	for i := range allNvidiaGPUResources {
		deviceInfo, err := c.driver.DeviceInfoByIndex(uint(i))
		if err != nil {
			return nil, fmt.Errorf("nvidia nvml DeviceInfoByIndex() error: %w", err)
		}

		allNvidiaGPUResources[i] = &FingerprintDeviceData{
			DeviceData: &DeviceData{
				DeviceName: deviceInfo.Name,
				UUID:       deviceInfo.UUID,
				MemoryMiB:  deviceInfo.MemoryMiB,
				PowerW:     deviceInfo.PowerW,
				BAR1MiB:    deviceInfo.BAR1MiB,
			},
			PCIBandwidthMBPerS: deviceInfo.PCIBandwidthMBPerS,
			CoresClockMHz:      deviceInfo.CoresClockMHz,
			MemoryClockMHz:     deviceInfo.MemoryClockMHz,
			DisplayState:       deviceInfo.DisplayState,
			PersistenceMode:    deviceInfo.PersistenceMode,
			PCIBusID:           deviceInfo.PCIBusID,
		}
	}

	return &FingerprintData{
		Devices:       allNvidiaGPUResources,
		DriverVersion: driverVersion,
	}, nil
}
// GetStatsData returns statistics data for all devices on this machine.
//
// It queries the device count, then the device info and status for every
// index. The first failing driver call aborts the whole operation: a nil
// result is returned together with an error that wraps the driver error and
// names the failing call.
func (c *nvmlClient) GetStatsData() ([]*StatsData, error) {
	/*
		nvml fields to be reported to stats api     # nvml_library_call
		1  - Used Memory                            # nvmlDeviceGetMemoryInfo
		2  - Utilization of GPU                     # nvmlDeviceGetUtilizationRates
		3  - Utilization of Memory                  # nvmlDeviceGetUtilizationRates
		4  - Utilization of Decoder                 # nvmlDeviceGetDecoderUtilization
		5  - Utilization of Encoder                 # nvmlDeviceGetEncoderUtilization
		6  - Current GPU Temperature                # nvmlDeviceGetTemperature
		7  - Power Draw                             # nvmlDeviceGetPowerUsage
		8  - BAR1 Used memory                       # nvmlDeviceGetBAR1MemoryInfo
		9  - ECC Errors on requesting L1Cache       # nvmlDeviceGetMemoryErrorCounter
		10 - ECC Errors on requesting L2Cache       # nvmlDeviceGetMemoryErrorCounter
		11 - ECC Errors on requesting Device memory # nvmlDeviceGetMemoryErrorCounter
	*/

	// The receiver is assumed to come from NewNvmlClient, which initializes
	// the driver; this method performs no initialization of its own.
	numDevices, err := c.driver.DeviceCount()
	if err != nil {
		return nil, fmt.Errorf("nvidia nvml DeviceCount() error: %w", err)
	}

	allNvidiaGPUStats := make([]*StatsData, numDevices)
	for i := range allNvidiaGPUStats {
		deviceInfo, deviceStatus, err := c.driver.DeviceInfoAndStatusByIndex(uint(i))
		if err != nil {
			return nil, fmt.Errorf("nvidia nvml DeviceInfoAndStatusByIndex() error: %w", err)
		}

		allNvidiaGPUStats[i] = &StatsData{
			DeviceData: &DeviceData{
				DeviceName: deviceInfo.Name,
				UUID:       deviceInfo.UUID,
				MemoryMiB:  deviceInfo.MemoryMiB,
				PowerW:     deviceInfo.PowerW,
				BAR1MiB:    deviceInfo.BAR1MiB,
			},
			PowerUsageW:        deviceStatus.PowerUsageW,
			GPUUtilization:     deviceStatus.GPUUtilization,
			MemoryUtilization:  deviceStatus.MemoryUtilization,
			EncoderUtilization: deviceStatus.EncoderUtilization,
			DecoderUtilization: deviceStatus.DecoderUtilization,
			TemperatureC:       deviceStatus.TemperatureC,
			UsedMemoryMiB:      deviceStatus.UsedMemoryMiB,
			BAR1UsedMiB:        deviceStatus.BAR1UsedMiB,
			ECCErrorsL1Cache:   deviceStatus.ECCErrorsL1Cache,
			ECCErrorsL2Cache:   deviceStatus.ECCErrorsL2Cache,
			ECCErrorsDevice:    deviceStatus.ECCErrorsDevice,
		}
	}

	return allNvidiaGPUStats, nil
}

View File

@@ -1,399 +0,0 @@
package nvml
import (
"errors"
"testing"
"github.com/hashicorp/nomad/helper"
"github.com/stretchr/testify/require"
)
// MockNVMLDriver is a configurable driver stand-in for tests. Each
// *CallSuccessful flag decides whether the corresponding method succeeds or
// returns an error; the remaining fields hold the canned data that the
// successful calls return.
type MockNVMLDriver struct {
	systemDriverCallSuccessful               bool
	deviceCountCallSuccessful                bool
	deviceInfoByIndexCallSuccessful          bool
	deviceInfoAndStatusByIndexCallSuccessful bool
	driverVersion                            string
	devices                                  []*DeviceInfo
	deviceStatus                             []*DeviceStatus
}
// Initialize is a no-op for the mock and always succeeds.
func (m *MockNVMLDriver) Initialize() error {
	return nil
}
// Shutdown is a no-op for the mock and always succeeds.
func (m *MockNVMLDriver) Shutdown() error {
	return nil
}
// SystemDriverVersion returns the configured driver version, or an error when
// systemDriverCallSuccessful is false.
func (m *MockNVMLDriver) SystemDriverVersion() (string, error) {
	if m.systemDriverCallSuccessful {
		return m.driverVersion, nil
	}
	return "", errors.New("failed to get system driver")
}
// DeviceCount returns the number of configured devices, or an error when
// deviceCountCallSuccessful is false.
func (m *MockNVMLDriver) DeviceCount() (uint, error) {
	if m.deviceCountCallSuccessful {
		return uint(len(m.devices)), nil
	}
	return 0, errors.New("failed to get device length")
}
// DeviceInfoByIndex returns the configured device info at index. It fails for
// an out-of-range index, and otherwise when deviceInfoByIndexCallSuccessful
// is false; the range check takes precedence.
func (m *MockNVMLDriver) DeviceInfoByIndex(index uint) (*DeviceInfo, error) {
	switch {
	case index >= uint(len(m.devices)):
		return nil, errors.New("index is out of range")
	case !m.deviceInfoByIndexCallSuccessful:
		return nil, errors.New("failed to get device info by index")
	}
	return m.devices[index], nil
}
// DeviceInfoAndStatusByIndex returns the configured device info and status at
// index. It fails when index is out of range for either slice, and otherwise
// when deviceInfoAndStatusByIndexCallSuccessful is false; the range check
// takes precedence.
func (m *MockNVMLDriver) DeviceInfoAndStatusByIndex(index uint) (*DeviceInfo, *DeviceStatus, error) {
	outOfRange := index >= uint(len(m.devices)) || index >= uint(len(m.deviceStatus))
	switch {
	case outOfRange:
		return nil, nil, errors.New("index is out of range")
	case !m.deviceInfoAndStatusByIndexCallSuccessful:
		return nil, nil, errors.New("failed to get device info and status by index")
	}
	return m.devices[index], m.deviceStatus[index], nil
}
// TestGetFingerprintDataFromNVML drives nvmlClient.GetFingerprintData with a
// mock driver whose individual calls can be made to fail, and checks both the
// error outcome and the returned data.
//
// Every table entry runs as its own subtest so that a failing assertion
// (require stops the current test function) does not prevent the remaining
// cases from running, and failures are reported under the case name.
func TestGetFingerprintDataFromNVML(t *testing.T) {
	testCases := []struct {
		Name                string
		DriverConfiguration *MockNVMLDriver
		ExpectedError       bool
		ExpectedResult      *FingerprintData
	}{
		{
			Name:           "fail on systemDriverCallSuccessful",
			ExpectedError:  true,
			ExpectedResult: nil,
			DriverConfiguration: &MockNVMLDriver{
				systemDriverCallSuccessful:      false,
				deviceCountCallSuccessful:       true,
				deviceInfoByIndexCallSuccessful: true,
			},
		},
		{
			Name:           "fail on deviceCountCallSuccessful",
			ExpectedError:  true,
			ExpectedResult: nil,
			DriverConfiguration: &MockNVMLDriver{
				systemDriverCallSuccessful:      true,
				deviceCountCallSuccessful:       false,
				deviceInfoByIndexCallSuccessful: true,
			},
		},
		{
			Name:           "fail on deviceInfoByIndexCall",
			ExpectedError:  true,
			ExpectedResult: nil,
			DriverConfiguration: &MockNVMLDriver{
				systemDriverCallSuccessful:      true,
				deviceCountCallSuccessful:       true,
				deviceInfoByIndexCallSuccessful: false,
				devices: []*DeviceInfo{
					{
						UUID:               "UUID1",
						Name:               helper.StringToPtr("ModelName1"),
						MemoryMiB:          helper.Uint64ToPtr(16),
						PCIBusID:           "busId",
						PowerW:             helper.UintToPtr(100),
						BAR1MiB:            helper.Uint64ToPtr(100),
						PCIBandwidthMBPerS: helper.UintToPtr(100),
						CoresClockMHz:      helper.UintToPtr(100),
						MemoryClockMHz:     helper.UintToPtr(100),
					}, {
						UUID:               "UUID2",
						Name:               helper.StringToPtr("ModelName2"),
						MemoryMiB:          helper.Uint64ToPtr(8),
						PCIBusID:           "busId",
						PowerW:             helper.UintToPtr(100),
						BAR1MiB:            helper.Uint64ToPtr(100),
						PCIBandwidthMBPerS: helper.UintToPtr(100),
						CoresClockMHz:      helper.UintToPtr(100),
						MemoryClockMHz:     helper.UintToPtr(100),
					},
				},
			},
		},
		{
			Name:          "successful outcome",
			ExpectedError: false,
			ExpectedResult: &FingerprintData{
				DriverVersion: "driverVersion",
				Devices: []*FingerprintDeviceData{
					{
						DeviceData: &DeviceData{
							DeviceName: helper.StringToPtr("ModelName1"),
							UUID:       "UUID1",
							MemoryMiB:  helper.Uint64ToPtr(16),
							PowerW:     helper.UintToPtr(100),
							BAR1MiB:    helper.Uint64ToPtr(100),
						},
						PCIBusID:           "busId1",
						PCIBandwidthMBPerS: helper.UintToPtr(100),
						CoresClockMHz:      helper.UintToPtr(100),
						MemoryClockMHz:     helper.UintToPtr(100),
						DisplayState:       "Enabled",
						PersistenceMode:    "Enabled",
					}, {
						DeviceData: &DeviceData{
							DeviceName: helper.StringToPtr("ModelName2"),
							UUID:       "UUID2",
							MemoryMiB:  helper.Uint64ToPtr(8),
							PowerW:     helper.UintToPtr(200),
							BAR1MiB:    helper.Uint64ToPtr(200),
						},
						PCIBusID:           "busId2",
						PCIBandwidthMBPerS: helper.UintToPtr(200),
						CoresClockMHz:      helper.UintToPtr(200),
						MemoryClockMHz:     helper.UintToPtr(200),
						DisplayState:       "Enabled",
						PersistenceMode:    "Enabled",
					},
				},
			},
			DriverConfiguration: &MockNVMLDriver{
				systemDriverCallSuccessful:      true,
				deviceCountCallSuccessful:       true,
				deviceInfoByIndexCallSuccessful: true,
				driverVersion:                   "driverVersion",
				devices: []*DeviceInfo{
					{
						UUID:               "UUID1",
						Name:               helper.StringToPtr("ModelName1"),
						MemoryMiB:          helper.Uint64ToPtr(16),
						PCIBusID:           "busId1",
						PowerW:             helper.UintToPtr(100),
						BAR1MiB:            helper.Uint64ToPtr(100),
						PCIBandwidthMBPerS: helper.UintToPtr(100),
						CoresClockMHz:      helper.UintToPtr(100),
						MemoryClockMHz:     helper.UintToPtr(100),
						DisplayState:       "Enabled",
						PersistenceMode:    "Enabled",
					}, {
						UUID:               "UUID2",
						Name:               helper.StringToPtr("ModelName2"),
						MemoryMiB:          helper.Uint64ToPtr(8),
						PCIBusID:           "busId2",
						PowerW:             helper.UintToPtr(200),
						BAR1MiB:            helper.Uint64ToPtr(200),
						PCIBandwidthMBPerS: helper.UintToPtr(200),
						CoresClockMHz:      helper.UintToPtr(200),
						MemoryClockMHz:     helper.UintToPtr(200),
						DisplayState:       "Enabled",
						PersistenceMode:    "Enabled",
					},
				},
			},
		},
	}

	for _, testCase := range testCases {
		t.Run(testCase.Name, func(t *testing.T) {
			cli := nvmlClient{driver: testCase.DriverConfiguration}
			fingerprintData, err := cli.GetFingerprintData()

			if testCase.ExpectedError && err == nil {
				t.Errorf("case '%s' : expected Error, but didn't get one", testCase.Name)
			}
			if !testCase.ExpectedError && err != nil {
				t.Errorf("case '%s' : unexpected Error '%v'", testCase.Name, err)
			}
			require.New(t).Equal(testCase.ExpectedResult, fingerprintData)
		})
	}
}
func TestGetStatsDataFromNVML(t *testing.T) {
for _, testCase := range []struct {
Name string
DriverConfiguration *MockNVMLDriver
ExpectedError bool
ExpectedResult []*StatsData
}{
{
Name: "fail on deviceCountCallSuccessful",
ExpectedError: true,
ExpectedResult: nil,
DriverConfiguration: &MockNVMLDriver{
systemDriverCallSuccessful: true,
deviceCountCallSuccessful: false,
deviceInfoByIndexCallSuccessful: true,
deviceInfoAndStatusByIndexCallSuccessful: true,
},
},
{
Name: "fail on DeviceInfoAndStatusByIndex call",
ExpectedError: true,
ExpectedResult: nil,
DriverConfiguration: &MockNVMLDriver{
systemDriverCallSuccessful: true,
deviceCountCallSuccessful: true,
deviceInfoAndStatusByIndexCallSuccessful: false,
devices: []*DeviceInfo{
{
UUID: "UUID1",
Name: helper.StringToPtr("ModelName1"),
MemoryMiB: helper.Uint64ToPtr(16),
PCIBusID: "busId1",
PowerW: helper.UintToPtr(100),
BAR1MiB: helper.Uint64ToPtr(100),
PCIBandwidthMBPerS: helper.UintToPtr(100),
CoresClockMHz: helper.UintToPtr(100),
MemoryClockMHz: helper.UintToPtr(100),
}, {
UUID: "UUID2",
Name: helper.StringToPtr("ModelName2"),
MemoryMiB: helper.Uint64ToPtr(8),
PCIBusID: "busId2",
PowerW: helper.UintToPtr(200),
BAR1MiB: helper.Uint64ToPtr(200),
PCIBandwidthMBPerS: helper.UintToPtr(200),
CoresClockMHz: helper.UintToPtr(200),
MemoryClockMHz: helper.UintToPtr(200),
},
},
deviceStatus: []*DeviceStatus{
{
TemperatureC: helper.UintToPtr(1),
GPUUtilization: helper.UintToPtr(1),
MemoryUtilization: helper.UintToPtr(1),
EncoderUtilization: helper.UintToPtr(1),
DecoderUtilization: helper.UintToPtr(1),
UsedMemoryMiB: helper.Uint64ToPtr(1),
ECCErrorsL1Cache: helper.Uint64ToPtr(1),
ECCErrorsL2Cache: helper.Uint64ToPtr(1),
ECCErrorsDevice: helper.Uint64ToPtr(1),
PowerUsageW: helper.UintToPtr(1),
BAR1UsedMiB: helper.Uint64ToPtr(1),
},
{
TemperatureC: helper.UintToPtr(2),
GPUUtilization: helper.UintToPtr(2),
MemoryUtilization: helper.UintToPtr(2),
EncoderUtilization: helper.UintToPtr(2),
DecoderUtilization: helper.UintToPtr(2),
UsedMemoryMiB: helper.Uint64ToPtr(2),
ECCErrorsL1Cache: helper.Uint64ToPtr(2),
ECCErrorsL2Cache: helper.Uint64ToPtr(2),
ECCErrorsDevice: helper.Uint64ToPtr(2),
PowerUsageW: helper.UintToPtr(2),
BAR1UsedMiB: helper.Uint64ToPtr(2),
},
},
},
},
{
Name: "successful outcome",
ExpectedError: false,
ExpectedResult: []*StatsData{
{
DeviceData: &DeviceData{
DeviceName: helper.StringToPtr("ModelName1"),
UUID: "UUID1",
MemoryMiB: helper.Uint64ToPtr(16),
PowerW: helper.UintToPtr(100),
BAR1MiB: helper.Uint64ToPtr(100),
},
TemperatureC: helper.UintToPtr(1),
GPUUtilization: helper.UintToPtr(1),
MemoryUtilization: helper.UintToPtr(1),
EncoderUtilization: helper.UintToPtr(1),
DecoderUtilization: helper.UintToPtr(1),
UsedMemoryMiB: helper.Uint64ToPtr(1),
ECCErrorsL1Cache: helper.Uint64ToPtr(1),
ECCErrorsL2Cache: helper.Uint64ToPtr(1),
ECCErrorsDevice: helper.Uint64ToPtr(1),
PowerUsageW: helper.UintToPtr(1),
BAR1UsedMiB: helper.Uint64ToPtr(1),
},
{
DeviceData: &DeviceData{
DeviceName: helper.StringToPtr("ModelName2"),
UUID: "UUID2",
MemoryMiB: helper.Uint64ToPtr(8),
PowerW: helper.UintToPtr(200),
BAR1MiB: helper.Uint64ToPtr(200),
},
TemperatureC: helper.UintToPtr(2),
GPUUtilization: helper.UintToPtr(2),
MemoryUtilization: helper.UintToPtr(2),
EncoderUtilization: helper.UintToPtr(2),
DecoderUtilization: helper.UintToPtr(2),
UsedMemoryMiB: helper.Uint64ToPtr(2),
ECCErrorsL1Cache: helper.Uint64ToPtr(2),
ECCErrorsL2Cache: helper.Uint64ToPtr(2),
ECCErrorsDevice: helper.Uint64ToPtr(2),
PowerUsageW: helper.UintToPtr(2),
BAR1UsedMiB: helper.Uint64ToPtr(2),
},
},
DriverConfiguration: &MockNVMLDriver{
deviceCountCallSuccessful: true,
deviceInfoByIndexCallSuccessful: true,
deviceInfoAndStatusByIndexCallSuccessful: true,
devices: []*DeviceInfo{
{
UUID: "UUID1",
Name: helper.StringToPtr("ModelName1"),
MemoryMiB: helper.Uint64ToPtr(16),
PCIBusID: "busId1",
PowerW: helper.UintToPtr(100),
BAR1MiB: helper.Uint64ToPtr(100),
PCIBandwidthMBPerS: helper.UintToPtr(100),
CoresClockMHz: helper.UintToPtr(100),
MemoryClockMHz: helper.UintToPtr(100),
}, {
UUID: "UUID2",
Name: helper.StringToPtr("ModelName2"),
MemoryMiB: helper.Uint64ToPtr(8),
PCIBusID: "busId2",
PowerW: helper.UintToPtr(200),
BAR1MiB: helper.Uint64ToPtr(200),
PCIBandwidthMBPerS: helper.UintToPtr(200),
CoresClockMHz: helper.UintToPtr(200),
MemoryClockMHz: helper.UintToPtr(200),
},
},
deviceStatus: []*DeviceStatus{
{
TemperatureC: helper.UintToPtr(1),
GPUUtilization: helper.UintToPtr(1),
MemoryUtilization: helper.UintToPtr(1),
EncoderUtilization: helper.UintToPtr(1),
DecoderUtilization: helper.UintToPtr(1),
UsedMemoryMiB: helper.Uint64ToPtr(1),
ECCErrorsL1Cache: helper.Uint64ToPtr(1),
ECCErrorsL2Cache: helper.Uint64ToPtr(1),
ECCErrorsDevice: helper.Uint64ToPtr(1),
PowerUsageW: helper.UintToPtr(1),
BAR1UsedMiB: helper.Uint64ToPtr(1),
},
{
TemperatureC: helper.UintToPtr(2),
GPUUtilization: helper.UintToPtr(2),
MemoryUtilization: helper.UintToPtr(2),
EncoderUtilization: helper.UintToPtr(2),
DecoderUtilization: helper.UintToPtr(2),
UsedMemoryMiB: helper.Uint64ToPtr(2),
ECCErrorsL1Cache: helper.Uint64ToPtr(2),
ECCErrorsL2Cache: helper.Uint64ToPtr(2),
ECCErrorsDevice: helper.Uint64ToPtr(2),
PowerUsageW: helper.UintToPtr(2),
BAR1UsedMiB: helper.Uint64ToPtr(2),
},
},
},
},
} {
cli := nvmlClient{driver: testCase.DriverConfiguration}
statsData, err := cli.GetStatsData()
if testCase.ExpectedError && err == nil {
t.Errorf("case '%s' : expected Error, but didn't get one", testCase.Name)
}
if !testCase.ExpectedError && err != nil {
t.Errorf("case '%s' : unexpected Error '%v'", testCase.Name, err)
}
require.New(t).Equal(testCase.ExpectedResult, statsData)
}
}

View File

@@ -1,33 +0,0 @@
// +build !linux
package nvml
// Initialize is the stub used when the build constraint excludes Linux.
// It loads nothing and always returns UnavailableLib.
func (n *nvmlDriver) Initialize() error {
return UnavailableLib
}
// Shutdown is the non-Linux stub; it always returns UnavailableLib.
func (n *nvmlDriver) Shutdown() error {
return UnavailableLib
}
// SystemDriverVersion is the non-Linux stub; it always returns an empty
// version string together with UnavailableLib.
func (n *nvmlDriver) SystemDriverVersion() (string, error) {
return "", UnavailableLib
}
// DeviceCount is the non-Linux stub; it always returns a count of 0 together
// with UnavailableLib.
func (n *nvmlDriver) DeviceCount() (uint, error) {
return 0, UnavailableLib
}
// DeviceInfoByIndex is the non-Linux stub; index is ignored and the result is
// always a nil DeviceInfo together with UnavailableLib.
func (n *nvmlDriver) DeviceInfoByIndex(index uint) (*DeviceInfo, error) {
return nil, UnavailableLib
}
// DeviceInfoAndStatusByIndex is the non-Linux stub; index is ignored and the
// result is always a nil DeviceInfo and a nil DeviceStatus together with
// UnavailableLib.
func (n *nvmlDriver) DeviceInfoAndStatusByIndex(index uint) (*DeviceInfo, *DeviceStatus, error) {
return nil, nil, UnavailableLib
}

View File

@@ -1,85 +0,0 @@
package nvml
import (
"github.com/NVIDIA/gpu-monitoring-tools/bindings/go/nvml"
)
// Initialize nvml library by locating nvml shared object file and calling ldopen.
// The work is delegated to nvml.Init and its error is returned unchanged.
func (n *nvmlDriver) Initialize() error {
return nvml.Init()
}
// Shutdown stops any further interaction with nvml by delegating to
// nvml.Shutdown; its error is returned unchanged.
func (n *nvmlDriver) Shutdown() error {
return nvml.Shutdown()
}
// SystemDriverVersion returns the installed driver version exactly as reported
// by nvml.GetDriverVersion, along with any error from that call.
func (n *nvmlDriver) SystemDriverVersion() (string, error) {
return nvml.GetDriverVersion()
}
// DeviceCount reports the number of available GPU devices exactly as returned
// by nvml.GetDeviceCount, along with any error from that call.
func (n *nvmlDriver) DeviceCount() (uint, error) {
return nvml.GetDeviceCount()
}
// DeviceInfoByIndex looks up the GPU at the given position in the system
// device list and returns its static description, including the display and
// persistence modes. Any error from opening the device or from querying its
// mode is returned as-is with a nil DeviceInfo.
func (n *nvmlDriver) DeviceInfoByIndex(index uint) (*DeviceInfo, error) {
	gpu, err := nvml.NewDevice(index)
	if err != nil {
		return nil, err
	}

	mode, err := gpu.GetDeviceMode()
	if err != nil {
		return nil, err
	}

	info := &DeviceInfo{
		UUID:               gpu.UUID,
		Name:               gpu.Model,
		MemoryMiB:          gpu.Memory,
		PowerW:             gpu.Power,
		BAR1MiB:            gpu.PCI.BAR1,
		PCIBandwidthMBPerS: gpu.PCI.Bandwidth,
		PCIBusID:           gpu.PCI.BusID,
		CoresClockMHz:      gpu.Clocks.Cores,
		MemoryClockMHz:     gpu.Clocks.Memory,
		DisplayState:       mode.DisplayInfo.Mode.String(),
		PersistenceMode:    mode.Persistence.String(),
	}
	return info, nil
}
// DeviceInfoAndStatusByIndex looks up the GPU at the given position in the
// system device list and returns both its static description and its current
// status readings. Any error from opening the device or from reading its
// status is returned as-is with nil results.
//
// The returned DeviceInfo does not carry DisplayState or PersistenceMode: the
// device mode is not queried on this path.
func (n *nvmlDriver) DeviceInfoAndStatusByIndex(index uint) (*DeviceInfo, *DeviceStatus, error) {
	gpu, err := nvml.NewDevice(index)
	if err != nil {
		return nil, nil, err
	}

	current, err := gpu.Status()
	if err != nil {
		return nil, nil, err
	}

	info := &DeviceInfo{
		UUID:               gpu.UUID,
		Name:               gpu.Model,
		MemoryMiB:          gpu.Memory,
		PowerW:             gpu.Power,
		BAR1MiB:            gpu.PCI.BAR1,
		PCIBandwidthMBPerS: gpu.PCI.Bandwidth,
		PCIBusID:           gpu.PCI.BusID,
		CoresClockMHz:      gpu.Clocks.Cores,
		MemoryClockMHz:     gpu.Clocks.Memory,
	}

	status := &DeviceStatus{
		TemperatureC:       current.Temperature,
		GPUUtilization:     current.Utilization.GPU,
		MemoryUtilization:  current.Utilization.Memory,
		EncoderUtilization: current.Utilization.Encoder,
		DecoderUtilization: current.Utilization.Decoder,
		UsedMemoryMiB:      current.Memory.Global.Used,
		ECCErrorsL1Cache:   current.Memory.ECCErrors.L1Cache,
		ECCErrorsL2Cache:   current.Memory.ECCErrors.L2Cache,
		ECCErrorsDevice:    current.Memory.ECCErrors.Device,
		PowerUsageW:        current.Power,
		BAR1UsedMiB:        current.PCI.BAR1Used,
	}

	return info, status, nil
}

View File

@@ -1,61 +0,0 @@
package nvml
import "errors"
var (
// UnavailableLib is returned when the nvml library could not be loaded.
// The non-Linux nvmlDriver stubs return it from every method.
UnavailableLib = errors.New("could not load NVML library")
)
// nvmlDriver implements NvmlDriver. It holds no state of its own.
// Users are required to call the Initialize method before using any other
// methods.
type nvmlDriver struct{}
// NvmlDriver represents the set of methods used to query the nvml library.
// Devices are addressed by their index in the system device list.
type NvmlDriver interface {
Initialize() error
Shutdown() error
SystemDriverVersion() (string, error)
DeviceCount() (uint, error)
DeviceInfoByIndex(uint) (*DeviceInfo, error)
DeviceInfoAndStatusByIndex(uint) (*DeviceInfo, *DeviceStatus, error)
}
// DeviceInfo represents nvml device data.
// This struct is returned by the NvmlDriver DeviceInfoByIndex and
// DeviceInfoAndStatusByIndex methods.
type DeviceInfo struct {
// The following fields are guaranteed to be retrieved from nvml
// NOTE(review): the Linux DeviceInfoAndStatusByIndex implementation leaves
// DisplayState and PersistenceMode unset; only DeviceInfoByIndex fills them.
UUID string
PCIBusID string
DisplayState string
PersistenceMode string
// The following fields can be nil after a call to nvml, because nvml was
// not able to retrieve these fields for the specific nvidia card
Name *string
MemoryMiB *uint64
PowerW *uint
BAR1MiB *uint64
PCIBandwidthMBPerS *uint
CoresClockMHz *uint
MemoryClockMHz *uint
}
// DeviceStatus represents nvml device status.
// This struct is returned by the NvmlDriver DeviceInfoAndStatusByIndex method.
type DeviceStatus struct {
// The following fields can be nil after a call to nvml, because nvml was
// not able to retrieve these fields for the specific nvidia card
PowerUsageW *uint
TemperatureC *uint
GPUUtilization *uint // %
MemoryUtilization *uint // %
EncoderUtilization *uint // %
DecoderUtilization *uint // %
BAR1UsedMiB *uint64
UsedMemoryMiB *uint64
ECCErrorsL1Cache *uint64
ECCErrorsL2Cache *uint64
ECCErrorsDevice *uint64
}

View File

@@ -1,325 +0,0 @@
package nvidia
import (
"context"
"time"
"github.com/hashicorp/nomad/devices/gpu/nvidia/nvml"
"github.com/hashicorp/nomad/helper"
"github.com/hashicorp/nomad/plugins/device"
"github.com/hashicorp/nomad/plugins/shared/structs"
)
// Every statistic reported per device is described by three constants:
// <Name>Attr is the key under which statsForItem stores the value in the
// attributes map, <Name>Unit is the unit string and <Name>Desc is the
// human-readable description attached to the value.
const (
// Attribute names for reporting stats output
// Power: current usage over the maximum GPU power.
PowerUsageAttr = "Power usage"
PowerUsageUnit = "W"
PowerUsageDesc = "Power usage for this GPU in watts and " +
"its associated circuitry (e.g. memory) / Maximum GPU Power"
// GPU utilization.
GPUUtilizationAttr = "GPU utilization"
GPUUtilizationUnit = "%"
GPUUtilizationDesc = "Percent of time over the past sample period " +
"during which one or more kernels were executing on the GPU."
// Memory bandwidth utilization.
MemoryUtilizationAttr = "Memory utilization"
MemoryUtilizationUnit = "%"
MemoryUtilizationDesc = "Percentage of bandwidth used during the past sample period"
// Encoder utilization.
EncoderUtilizationAttr = "Encoder utilization"
EncoderUtilizationUnit = "%"
EncoderUtilizationDesc = "Percent of time over the past sample period " +
"during which GPU Encoder was used"
// Decoder utilization.
DecoderUtilizationAttr = "Decoder utilization"
DecoderUtilizationUnit = "%"
DecoderUtilizationDesc = "Percent of time over the past sample period " +
"during which GPU Decoder was used"
// Temperature.
TemperatureAttr = "Temperature"
TemperatureUnit = "C" // Celsius degrees
TemperatureDesc = "Temperature of the Unit"
// Memory: used over total; statsForItem also reports this value as the
// device summary.
MemoryStateAttr = "Memory state"
MemoryStateUnit = "MiB" // Mebibytes
MemoryStateDesc = "UsedMemory / TotalMemory"
// BAR1 buffer: used over total.
BAR1StateAttr = "BAR1 buffer state"
BAR1StateUnit = "MiB" // Mebibytes
BAR1StateDesc = "UsedBAR1 / TotalBAR1"
// ECC error counters.
ECCErrorsL1CacheAttr = "ECC L1 errors"
ECCErrorsL1CacheUnit = "#" // number of errors
ECCErrorsL1CacheDesc = "Requested L1Cache error counter for the device"
ECCErrorsL2CacheAttr = "ECC L2 errors"
ECCErrorsL2CacheUnit = "#" // number of errors
ECCErrorsL2CacheDesc = "Requested L2Cache error counter for the device"
ECCErrorsDeviceAttr = "ECC memory errors"
ECCErrorsDeviceUnit = "#" // number of errors
ECCErrorsDeviceDesc = "Requested memory error counter for the device"
)
// stats is the long running goroutine that streams device statistics.
//
// It sends one StatsResponse immediately and then one every interval until ctx
// is cancelled. The stats channel is always closed on return.
//
// If the device failed to initialize, nothing is streamed: the function
// returns right away, first sending the initialization error on the channel
// unless that error carries the same message as nvml.UnavailableLib (in which
// case it exits silently).
func (d *NvidiaDevice) stats(ctx context.Context, stats chan<- *device.StatsResponse, interval time.Duration) {
	defer close(stats)

	if d.initErr != nil {
		// Errors are compared by message, exactly as before, so an
		// unavailable-library error is recognised even when it is not the
		// very same error value.
		if d.initErr.Error() != nvml.UnavailableLib.Error() {
			d.logger.Error("exiting stats due to problems with NVML loading", "error", d.initErr)
			stats <- device.NewStatsError(d.initErr)
		}

		return
	}

	// Create a timer that will fire immediately for the first detection
	timer := time.NewTimer(0)
	// Release the pending timer when the goroutine exits instead of leaving
	// it armed for up to another interval.
	defer timer.Stop()

	for {
		select {
		case <-ctx.Done():
			return
		case <-timer.C:
			timer.Reset(interval)
		}

		d.writeStatsToChannel(stats, time.Now())
	}
}
// filterStatsByID keeps only the entries of stats whose UUID is a key of ids,
// preserving their order. The result is nil when no entry matches.
func filterStatsByID(stats []*nvml.StatsData, ids map[string]struct{}) []*nvml.StatsData {
	var kept []*nvml.StatsData
	for i := range stats {
		entry := stats[i]
		if _, wanted := ids[entry.UUID]; !wanted {
			continue
		}
		kept = append(kept, entry)
	}
	return kept
}
// writeStatsToChannel takes one statistics sample and sends it on stats.
//
// It fetches StatsData from the NVML client, drops entries for devices not
// tracked in d.devices, groups the rest by device name and sends one
// StatsResponse holding a DeviceGroupStats per group. Devices whose name NVML
// could not determine share a single group named by the 'notAvailable'
// constant. If fetching fails, the error is logged and sent as a StatsResponse
// carrying only that error.
func (d *NvidiaDevice) writeStatsToChannel(stats chan<- *device.StatsResponse, timestamp time.Time) {
	collected, err := d.nvmlClient.GetStatsData()
	if err != nil {
		d.logger.Error("failed to get nvidia stats", "error", err)
		stats <- &device.StatsResponse{Error: err}
		return
	}

	// Only report devices that are stored in the NvidiaDevice struct; the
	// device set is read under the read lock.
	d.deviceLock.RLock()
	collected = filterStatsByID(collected, d.devices)
	d.deviceLock.RUnlock()

	// Bucket the samples by device name.
	byName := make(map[string][]*nvml.StatsData)
	for _, entry := range collected {
		name := notAvailable
		if entry.DeviceName != nil {
			name = *entry.DeviceName
		}
		byName[name] = append(byName[name], entry)
	}

	// One DeviceGroupStats per bucket.
	groups := make([]*device.DeviceGroupStats, 0, len(byName))
	for name, members := range byName {
		groups = append(groups, statsForGroup(name, members, timestamp))
	}

	stats <- &device.StatsResponse{Groups: groups}
}
// newNotAvailableDeviceStats builds the placeholder StatValue used when a
// reading is missing: it carries the given unit and description and the
// 'notAvailable' constant as its string value.
func newNotAvailableDeviceStats(unit, desc string) *structs.StatValue {
	placeholder := &structs.StatValue{
		Unit:      unit,
		Desc:      desc,
		StringVal: helper.StringToPtr(notAvailable),
	}
	return placeholder
}
// statsForGroup assembles the device.DeviceGroupStats for one device group:
// the group is named groupName and holds one DeviceStats per entry of
// groupStats, keyed by the entry's UUID.
func statsForGroup(groupName string, groupStats []*nvml.StatsData, timestamp time.Time) *device.DeviceGroupStats {
	group := &device.DeviceGroupStats{
		Vendor:        vendor,
		Type:          deviceType,
		Name:          groupName,
		InstanceStats: make(map[string]*device.DeviceStats),
	}

	for _, entry := range groupStats {
		group.InstanceStats[entry.UUID] = statsForItem(entry, timestamp)
	}

	return group
}
// statsForItem converts one nvml.StatsData sample into a device.DeviceStats
// stamped with timestamp.
//
// nvml.StatsData holds pointers that can be nil. A single-valued statistic
// whose reading is nil, and a used/total statistic with either side nil, is
// reported with the 'notAvailable' placeholder instead of a number. The memory
// state statistic is also used as the device summary.
func statsForItem(statsItem *nvml.StatsData, timestamp time.Time) *device.DeviceStats {
	// gauge reports a single reading, or the placeholder when it is missing.
	gauge := func(unit, desc string, reading *int64) *structs.StatValue {
		if reading == nil {
			return newNotAvailableDeviceStats(unit, desc)
		}
		return &structs.StatValue{
			Unit:            unit,
			Desc:            desc,
			IntNumeratorVal: reading,
		}
	}

	// Used/total statistics start out as the placeholder and are replaced
	// only when both sides are known.
	powerUsage := newNotAvailableDeviceStats(PowerUsageUnit, PowerUsageDesc)
	if statsItem.PowerUsageW != nil && statsItem.PowerW != nil {
		powerUsage = &structs.StatValue{
			Unit:              PowerUsageUnit,
			Desc:              PowerUsageDesc,
			IntNumeratorVal:   helper.Int64ToPtr(int64(*statsItem.PowerUsageW)),
			IntDenominatorVal: uintToInt64Ptr(statsItem.PowerW),
		}
	}

	memoryState := newNotAvailableDeviceStats(MemoryStateUnit, MemoryStateDesc)
	if statsItem.UsedMemoryMiB != nil && statsItem.MemoryMiB != nil {
		memoryState = &structs.StatValue{
			Unit:              MemoryStateUnit,
			Desc:              MemoryStateDesc,
			IntNumeratorVal:   uint64ToInt64Ptr(statsItem.UsedMemoryMiB),
			IntDenominatorVal: uint64ToInt64Ptr(statsItem.MemoryMiB),
		}
	}

	bar1State := newNotAvailableDeviceStats(BAR1StateUnit, BAR1StateDesc)
	if statsItem.BAR1UsedMiB != nil && statsItem.BAR1MiB != nil {
		bar1State = &structs.StatValue{
			Unit:              BAR1StateUnit,
			Desc:              BAR1StateDesc,
			IntNumeratorVal:   uint64ToInt64Ptr(statsItem.BAR1UsedMiB),
			IntDenominatorVal: uint64ToInt64Ptr(statsItem.BAR1MiB),
		}
	}

	attributes := map[string]*structs.StatValue{
		PowerUsageAttr:         powerUsage,
		GPUUtilizationAttr:     gauge(GPUUtilizationUnit, GPUUtilizationDesc, uintToInt64Ptr(statsItem.GPUUtilization)),
		MemoryUtilizationAttr:  gauge(MemoryUtilizationUnit, MemoryUtilizationDesc, uintToInt64Ptr(statsItem.MemoryUtilization)),
		EncoderUtilizationAttr: gauge(EncoderUtilizationUnit, EncoderUtilizationDesc, uintToInt64Ptr(statsItem.EncoderUtilization)),
		DecoderUtilizationAttr: gauge(DecoderUtilizationUnit, DecoderUtilizationDesc, uintToInt64Ptr(statsItem.DecoderUtilization)),
		TemperatureAttr:        gauge(TemperatureUnit, TemperatureDesc, uintToInt64Ptr(statsItem.TemperatureC)),
		MemoryStateAttr:        memoryState,
		BAR1StateAttr:          bar1State,
		ECCErrorsL1CacheAttr:   gauge(ECCErrorsL1CacheUnit, ECCErrorsL1CacheDesc, uint64ToInt64Ptr(statsItem.ECCErrorsL1Cache)),
		ECCErrorsL2CacheAttr:   gauge(ECCErrorsL2CacheUnit, ECCErrorsL2CacheDesc, uint64ToInt64Ptr(statsItem.ECCErrorsL2Cache)),
		ECCErrorsDeviceAttr:    gauge(ECCErrorsDeviceUnit, ECCErrorsDeviceDesc, uint64ToInt64Ptr(statsItem.ECCErrorsDevice)),
	}

	return &device.DeviceStats{
		Summary:   memoryState,
		Stats:     &structs.StatObject{Attributes: attributes},
		Timestamp: timestamp,
	}
}
// uintToInt64Ptr converts an optional uint into an optional int64. A nil input
// yields nil; otherwise the result points to a fresh int64 holding the
// converted value, independent of the input.
func uintToInt64Ptr(u *uint) *int64 {
	if u != nil {
		converted := int64(*u)
		return &converted
	}
	return nil
}
// uint64ToInt64Ptr converts an optional uint64 into an optional int64. A nil
// input yields nil; otherwise the result points to a fresh int64 holding the
// converted value, independent of the input.
func uint64ToInt64Ptr(u *uint64) *int64 {
	if u != nil {
		converted := int64(*u)
		return &converted
	}
	return nil
}

File diff suppressed because it is too large Load Diff

2
go.mod
View File

@@ -19,7 +19,6 @@ require (
github.com/Azure/go-autorest/autorest/azure/auth v0.5.1 // indirect
github.com/LK4D4/joincontext v0.0.0-20171026170139-1724345da6d5
github.com/Microsoft/go-winio v0.4.15-0.20200113171025-3fe6c5262873
github.com/NVIDIA/gpu-monitoring-tools v0.0.0-20180829222009-86f2a9fac6c5
github.com/NYTimes/gziphandler v1.0.1
github.com/armon/circbuf v0.0.0-20150827004946-bbbad097214e
github.com/armon/go-metrics v0.3.4
@@ -32,6 +31,7 @@ require (
github.com/coreos/go-iptables v0.4.3-0.20190724151750-969b135e941d
github.com/coreos/go-semver v0.3.0
github.com/cyphar/filepath-securejoin v0.2.3-0.20190205144030-7efe413b52e1 // indirect
github.com/denverdino/aliyungo v0.0.0-20190125010748-a747050bb1ba // indirect
github.com/docker/cli v0.0.0-20200303215952-eb310fca4956
github.com/docker/distribution v2.7.1+incompatible
github.com/docker/docker v17.12.0-ce-rc1.0.20200330121334-7f8b4b621b5d+incompatible

5
go.sum
View File

@@ -64,8 +64,6 @@ github.com/LK4D4/joincontext v0.0.0-20171026170139-1724345da6d5/go.mod h1:nxQPcN
github.com/Microsoft/hcsshim v0.8.7/go.mod h1:OHd7sQqRFrYd3RmSgbgji+ctCwkbq2wbEYNSzOYtcBQ=
github.com/Microsoft/hcsshim v0.8.9 h1:VrfodqvztU8YSOvygU+DN1BGaSGxmrNfqOv5oOuX2Bk=
github.com/Microsoft/hcsshim v0.8.9/go.mod h1:5692vkUqntj1idxauYlpoINNKeqCiG6Sg38RRsjT5y8=
github.com/NVIDIA/gpu-monitoring-tools v0.0.0-20180829222009-86f2a9fac6c5 h1:WLyvLAM0QfjAarRzRTG9EgT5McqGWNZMvqqSUSoyUUY=
github.com/NVIDIA/gpu-monitoring-tools v0.0.0-20180829222009-86f2a9fac6c5/go.mod h1:nMOvShGpWaf0bXwXmeu4k+O4uziuaEI8pWzIj3BUrOA=
github.com/NYTimes/gziphandler v1.0.0 h1:OswZCvpiFsNRCbeapdJxDuikAqVXTgV7XAht8S9olZo=
github.com/NYTimes/gziphandler v1.0.0/go.mod h1:3wb06e3pkSAbeQ52E9H9iFoQsEEwGN64994WTCIhntQ=
github.com/PuerkitoBio/purell v1.0.0/go.mod h1:c11w/QuzBsJSee3cPx9rAFu61PvFxuPbtSwDGJws/X0=
@@ -182,8 +180,9 @@ github.com/cyphar/filepath-securejoin v0.2.3-0.20190205144030-7efe413b52e1/go.mo
github.com/davecgh/go-spew v1.1.0/go.mod h1:J7Y8YcW2NihsgmVo/mv3lAwl/skON4iLHjSsI+c5H38=
github.com/davecgh/go-spew v1.1.1 h1:vj9j/u1bqnvCEfJOwUhtlOARqs3+rkHYY13jYWTU97c=
github.com/davecgh/go-spew v1.1.1/go.mod h1:J7Y8YcW2NihsgmVo/mv3lAwl/skON4iLHjSsI+c5H38=
github.com/denverdino/aliyungo v0.0.0-20170926055100-d3308649c661 h1:lrWnAyy/F72MbxIxFUzKmcMCdt9Oi8RzpAxzTNQHD7o=
github.com/denverdino/aliyungo v0.0.0-20170926055100-d3308649c661/go.mod h1:dV8lFg6daOBZbT6/BDGIz6Y3WFGn8juu6G+CQ6LHtl0=
github.com/denverdino/aliyungo v0.0.0-20190125010748-a747050bb1ba h1:p6poVbjHDkKa+wtC8frBMwQtT3BmqGYBjzMwJ63tuR4=
github.com/denverdino/aliyungo v0.0.0-20190125010748-a747050bb1ba/go.mod h1:dV8lFg6daOBZbT6/BDGIz6Y3WFGn8juu6G+CQ6LHtl0=
github.com/dgrijalva/jwt-go v3.2.0+incompatible h1:7qlOGliEKZXTDg6OTjfoBKDXWrumCAMpl/TFQ4/5kLM=
github.com/dgrijalva/jwt-go v3.2.0+incompatible/go.mod h1:E3ru+11k8xSBh+hMPgOLZmtrrCbhqsmaPHjLKYnJCaQ=
github.com/digitalocean/godo v1.7.5/go.mod h1:h6faOIcZ8lWIwNQ+DN7b3CgX4Kwby5T+nbpNqkUIozU=

View File

@@ -1,14 +0,0 @@
// +build !nonvidia
package catalog
import (
"github.com/hashicorp/nomad/devices/gpu/nvidia"
)
// This file is where all builtin plugins should be registered in the catalog.
// Plugins with build restrictions should be placed in the appropriate
// register_XXX.go file.
//
// init registers the Nvidia device plugin under nvidia.PluginID with
// nvidia.PluginConfig.
func init() {
Register(nvidia.PluginID, nvidia.PluginConfig)
}

View File

@@ -1,30 +1,29 @@
---
layout: docs
page_title: 'Device Plugins: Community Supported'
description: A list of community supported Device Plugins.
page_title: 'Device Plugins: External'
description: 'A list of external Device Plugins.'
---
# Community Supported
If you have authored a device plugin that you believe will be useful to the
broader Nomad community and you are committed to maintaining the plugin, please
file a PR to add your plugin to this page.
## Device Plugins
# External Device Plugins
Nomad has a plugin system for defining device plugins. External device
plugins provide the same user experience as built-in device plugins.
Below is a list of community-supported task drivers you can use with Nomad:
Below is a list of official external device plugins you can use with Nomad:
- [Nvidia][nvidia]
## Community Supported
If you have authored a device plugin that you believe will be useful to the
broader Nomad community and you are committed to maintaining the plugin,
please file a PR to add your plugin to this page. For details on authoring a
device plugin, please refer to the [plugin authoring guide][plugin_guide].
Below is a list of community-supported device plugins you can use with Nomad:
- [USB][usb]
## Authoring Device Plugins
Nomad has a plugin system for defining device drivers. External device plugins
will have the same user experience as built in drivers. For details on
authoring a device plugin, please refer to the [plugin authoring
guide][plugin_guide].
[plugin_guide]: /docs/internals/plugins
[nvidia]: /docs/devices/external/nvidia
[usb]: /docs/devices/external/usb

View File

@@ -6,18 +6,13 @@ description: Device Plugins are used to expose devices to tasks in Nomad.
# Device Plugins
Device plugins are used to detect and make devices available to tasks in Nomad.
Devices are physical hardware that exists on a node such as a GPU or an FPGA. By
having extensible device plugins, Nomad has the flexibility to support a broad
set of devices and allows the community to build additional device plugins as
needed.
Device plugins are used to detect and make devices available to tasks in
Nomad. Devices are physical hardware that exists on a client node such as a
GPU or an FPGA. By having extensible device plugins, Nomad has the flexibility
to support a broad set of devices and allows the community to build additional
device plugins as needed.
The list of supported device plugins is provided on the left of this page.
Each device plugin documents its configuration and installation requirements,
the attributes it fingerprints, and the environment variables it exposes to
tasks.
For details on authoring a device plugin, please refer to the [plugin authoring
guide][plugin_guide].
[plugin_guide]: /docs/internals/plugins

View File

@@ -1442,16 +1442,16 @@
"path": "devices"
},
{
"title": "Nvidia",
"path": "devices/nvidia"
},
{
"title": "Community",
"title": "External",
"routes": [
{
"title": "Overview",
"path": "devices/external"
},
{
"title": "Nvidia",
"path": "devices/external/nvidia"
},
{
"title": "USB <sup>Beta</sup>",
"path": "devices/external/usb"