Merge pull request #4638 from oleksii-shyman/nvidia-plugin

WIP :: Nvidia Plugin
This commit is contained in:
Alex Dadgar
2018-10-04 15:24:36 -07:00
committed by GitHub
21 changed files with 13221 additions and 1 deletion


@@ -57,11 +57,16 @@ func Int64ToPtr(i int64) *int64 {
return &i
}
// Uint64ToPtr returns the pointer to an uint64
func Uint64ToPtr(u uint64) *uint64 {
return &u
}
// UintToPtr returns the pointer to an uint
func UintToPtr(u uint) *uint {
return &u
}
// StringToPtr returns the pointer to a string
func StringToPtr(str string) *string {
return &str


@@ -0,0 +1,23 @@
This package provides an implementation of the Nvidia device plugin.
# Behavior
The Nvidia device plugin uses NVML bindings to get data about available Nvidia devices and exposes them via the Fingerprint RPC. GPUs can be excluded from fingerprinting by setting the `ignored_gpu_ids` field. The plugin emits statistics for fingerprinted devices every `stats_period` interval.
# Config
The configuration should be passed via an HCL file that begins with a top-level `config` stanza:
```
config {
ignored_gpu_ids = ["uuid1", "uuid2"]
fingerprint_period = "5s"
stats_period = "5s"
}
```
The valid configuration options are:
* `ignored_gpu_ids` (`list(string)`: `[]`): A list of GPU UUID strings that should not be exposed to Nomad.
* `fingerprint_period` (`string`: `"5s"`): The interval at which to repeat the fingerprint process to detect device changes.
* `stats_period` (`string`: `"5s"`): The interval at which to emit statistics about the devices.


@@ -0,0 +1,18 @@
package main
import (
log "github.com/hashicorp/go-hclog"
"github.com/hashicorp/nomad/plugins"
"github.com/hashicorp/nomad/plugins/device/cmd/nvidia"
)
func main() {
// Serve the plugin
plugins.Serve(factory)
}
// factory returns a new instance of the Nvidia GPU plugin
func factory(log log.Logger) interface{} {
return nvidia.NewNvidiaDevice(log)
}


@@ -0,0 +1,209 @@
package nvidia
import (
"context"
"fmt"
"strings"
"sync"
"time"
log "github.com/hashicorp/go-hclog"
"github.com/hashicorp/nomad/plugins/base"
"github.com/hashicorp/nomad/plugins/device"
"github.com/hashicorp/nomad/plugins/device/cmd/nvidia/nvml"
"github.com/hashicorp/nomad/plugins/shared/hclspec"
)
const (
// pluginName is the name of the plugin
pluginName = "nvidia-gpu"
// vendor is the vendor providing the devices
vendor = "nvidia"
// deviceType is the type of device being returned
deviceType = device.DeviceTypeGPU
// notAvailable is returned to the Nomad server when some properties could
// not be detected by the nvml driver
notAvailable = "N/A"
)
const (
// Nvidia-container-runtime environment variable names
nvidiaVisibleDevices = "NVIDIA_VISIBLE_DEVICES"
)
var (
// pluginInfo describes the plugin
pluginInfo = &base.PluginInfoResponse{
Type: base.PluginTypeDevice,
PluginApiVersion: "0.0.1", // XXX This should be an array and should be consts
PluginVersion: "0.1.0",
Name: pluginName,
}
// configSpec is the specification of the plugin's configuration
configSpec = hclspec.NewObject(map[string]*hclspec.Spec{
"ignored_gpu_ids": hclspec.NewDefault(
hclspec.NewAttr("ignored_gpu_ids", "list(string)", false),
hclspec.NewLiteral("[]"),
),
"fingerprint_period": hclspec.NewDefault(
hclspec.NewAttr("fingerprint_period", "string", false),
hclspec.NewLiteral("\"5s\""),
),
"stats_period": hclspec.NewDefault(
hclspec.NewAttr("stats_period", "string", false),
hclspec.NewLiteral("\"5s\""),
),
})
)
// Config contains configuration information for the plugin.
type Config struct {
IgnoredGPUIDs []string `codec:"ignored_gpu_ids"`
FingerprintPeriod string `codec:"fingerprint_period"`
StatsPeriod string `codec:"stats_period"`
}
// NvidiaDevice contains all plugin specific data
type NvidiaDevice struct {
// nvmlClient is used to get data from nvidia
nvmlClient nvml.NvmlClient
// nvmlClientInitializationError holds an error retrieved during
// nvmlClient initialization
nvmlClientInitializationError error
// ignoredGPUIDs is a set of UUIDs that would not be exposed to nomad
ignoredGPUIDs map[string]struct{}
// fingerprintPeriod is how often we should call nvml to get list of devices
fingerprintPeriod time.Duration
// statsPeriod is how often we should collect statistics for fingerprinted
// devices.
statsPeriod time.Duration
// devices is the set of detected eligible devices
devices map[string]struct{}
deviceLock sync.RWMutex
logger log.Logger
}
// NewNvidiaDevice returns a new nvidia device plugin.
func NewNvidiaDevice(log log.Logger) *NvidiaDevice {
nvmlClient, nvmlClientInitializationError := nvml.NewNvmlClient()
logger := log.Named(pluginName)
if nvmlClientInitializationError != nil {
logger.Error("unable to initialize Nvidia driver", "error", nvmlClientInitializationError)
}
return &NvidiaDevice{
logger: logger,
devices: make(map[string]struct{}),
ignoredGPUIDs: make(map[string]struct{}),
nvmlClient: nvmlClient,
nvmlClientInitializationError: nvmlClientInitializationError,
}
}
// PluginInfo returns information describing the plugin.
func (d *NvidiaDevice) PluginInfo() (*base.PluginInfoResponse, error) {
return pluginInfo, nil
}
// ConfigSchema returns the plugins configuration schema.
func (d *NvidiaDevice) ConfigSchema() (*hclspec.Spec, error) {
return configSpec, nil
}
// SetConfig is used to set the configuration of the plugin.
func (d *NvidiaDevice) SetConfig(data []byte) error {
var config Config
if err := base.MsgPackDecode(data, &config); err != nil {
return err
}
for _, ignoredGPUId := range config.IgnoredGPUIDs {
d.ignoredGPUIDs[ignoredGPUId] = struct{}{}
}
period, err := time.ParseDuration(config.FingerprintPeriod)
if err != nil {
return fmt.Errorf("failed to parse fingerprint period %q: %v", config.FingerprintPeriod, err)
}
d.fingerprintPeriod = period
// Convert the stats period
speriod, err := time.ParseDuration(config.StatsPeriod)
if err != nil {
return fmt.Errorf("failed to parse stats period %q: %v", config.StatsPeriod, err)
}
d.statsPeriod = speriod
return nil
}
// Fingerprint streams detected devices. If device changes are detected or the
// device health changes, messages will be emitted.
func (d *NvidiaDevice) Fingerprint(ctx context.Context) (<-chan *device.FingerprintResponse, error) {
outCh := make(chan *device.FingerprintResponse)
go d.fingerprint(ctx, outCh)
return outCh, nil
}
type reservationError struct {
notExistingIDs []string
}
func (e *reservationError) Error() string {
return fmt.Sprintf("unknown device IDs: %s", strings.Join(e.notExistingIDs, ","))
}
// Reserve returns information on how to mount the given devices.
// It is assumed that the Nomad server is responsible for the correctness of
// GPU allocations, handling tricky cases such as double-allocation of a single GPU.
func (d *NvidiaDevice) Reserve(deviceIDs []string) (*device.ContainerReservation, error) {
if len(deviceIDs) == 0 {
return &device.ContainerReservation{}, nil
}
// Due to the asynchronous nature of NvidiaPlugin, there is a possibility
// of a race condition
//
// Timeline:
// 1 - fingerprint reports that GPU with id "1" is present
// 2 - the following events happen at the same time:
// a) the server decides to allocate GPU with id "1"
// b) the fingerprint check reports that GPU with id "1" is no longer present
//
// The latest, always valid set of fingerprinted ids is stored in the
// d.devices map. To avoid this race condition, an error is returned if
// any of the provided deviceIDs is not found in the d.devices map
d.deviceLock.RLock()
var notExistingIDs []string
for _, id := range deviceIDs {
if _, deviceIDExists := d.devices[id]; !deviceIDExists {
notExistingIDs = append(notExistingIDs, id)
}
}
d.deviceLock.RUnlock()
if len(notExistingIDs) != 0 {
return nil, &reservationError{notExistingIDs}
}
return &device.ContainerReservation{
Envs: map[string]string{
nvidiaVisibleDevices: strings.Join(deviceIDs, ","),
},
}, nil
}
// Stats streams statistics for the detected devices.
func (d *NvidiaDevice) Stats(ctx context.Context) (<-chan *device.StatsResponse, error) {
outCh := make(chan *device.StatsResponse)
go d.stats(ctx, outCh)
return outCh, nil
}
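Fingerprint and Stats return receive-only channels that the goroutines above keep writing to until the context is cancelled (or the channel is closed after a fatal NVML error). Below is a minimal, hypothetical consumer sketch; it is not part of this change, and the function and variable names are assumptions for illustration only.

```go
// consumeStreams is a hypothetical sketch of how a caller drains the
// streaming Fingerprint and Stats channels defined above.
func consumeStreams(ctx context.Context, dev *NvidiaDevice) {
	fingerprints, _ := dev.Fingerprint(ctx)
	statsCh, _ := dev.Stats(ctx)
	for {
		select {
		case f, ok := <-fingerprints:
			if !ok {
				return // channel closed: fingerprinting has stopped
			}
			_ = f // carries the current device groups, or an error
		case s, ok := <-statsCh:
			if !ok {
				return // channel closed: stats collection has stopped
			}
			_ = s // carries per-group instance statistics, or an error
		case <-ctx.Done():
			return
		}
	}
}
```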


@@ -0,0 +1,115 @@
package nvidia
import (
"testing"
"github.com/hashicorp/nomad/plugins/device/cmd/nvidia/nvml"
hclog "github.com/hashicorp/go-hclog"
"github.com/hashicorp/nomad/plugins/device"
"github.com/stretchr/testify/require"
)
type MockNvmlClient struct {
FingerprintError error
FingerprintResponseReturned *nvml.FingerprintData
StatsError error
StatsResponseReturned []*nvml.StatsData
}
func (c *MockNvmlClient) GetFingerprintData() (*nvml.FingerprintData, error) {
return c.FingerprintResponseReturned, c.FingerprintError
}
func (c *MockNvmlClient) GetStatsData() ([]*nvml.StatsData, error) {
return c.StatsResponseReturned, c.StatsError
}
func TestReserve(t *testing.T) {
for _, testCase := range []struct {
Name string
ExpectedReservation *device.ContainerReservation
ExpectedError error
Device *NvidiaDevice
RequestedIDs []string
}{
{
Name: "All RequestedIDs are not managed by Device",
ExpectedReservation: nil,
ExpectedError: &reservationError{[]string{
"UUID1",
"UUID2",
"UUID3",
}},
RequestedIDs: []string{
"UUID1",
"UUID2",
"UUID3",
},
Device: &NvidiaDevice{
logger: hclog.NewNullLogger(),
},
},
{
Name: "Some RequestedIDs are not managed by Device",
ExpectedReservation: nil,
ExpectedError: &reservationError{[]string{
"UUID1",
"UUID2",
}},
RequestedIDs: []string{
"UUID1",
"UUID2",
"UUID3",
},
Device: &NvidiaDevice{
devices: map[string]struct{}{
"UUID3": {},
},
logger: hclog.NewNullLogger(),
},
},
{
Name: "All RequestedIDs are managed by Device",
ExpectedReservation: &device.ContainerReservation{
Envs: map[string]string{
nvidiaVisibleDevices: "UUID1,UUID2,UUID3",
},
},
ExpectedError: nil,
RequestedIDs: []string{
"UUID1",
"UUID2",
"UUID3",
},
Device: &NvidiaDevice{
devices: map[string]struct{}{
"UUID1": {},
"UUID2": {},
"UUID3": {},
},
logger: hclog.NewNullLogger(),
},
},
{
Name: "No IDs requested",
ExpectedReservation: &device.ContainerReservation{},
ExpectedError: nil,
RequestedIDs: nil,
Device: &NvidiaDevice{
devices: map[string]struct{}{
"UUID1": {},
"UUID2": {},
"UUID3": {},
},
logger: hclog.NewNullLogger(),
},
},
} {
actualReservation, actualError := testCase.Device.Reserve(testCase.RequestedIDs)
req := require.New(t)
req.Equal(testCase.ExpectedReservation, actualReservation)
req.Equal(testCase.ExpectedError, actualError)
}
}


@@ -0,0 +1,235 @@
package nvidia
import (
"context"
"fmt"
"time"
"github.com/hashicorp/nomad/plugins/device"
"github.com/hashicorp/nomad/plugins/device/cmd/nvidia/nvml"
)
const (
// Attribute names for reporting Fingerprint output
MemoryMiBAttr = "memory_mib"
PowerWAttr = "power_w"
BAR1MiBAttr = "bar1_mib"
DriverVersionAttr = "driver_version"
CoresClockMHzAttr = "cores_clock_mhz"
MemoryClockMHzAttr = "memory_clock_mhz"
PCIBandwidthMBPerSAttr = "pci_bandwidth_mb/s"
DisplayStateAttr = "display_state"
PersistenceModeAttr = "persistence_mode"
)
// fingerprint is the long running goroutine that detects hardware
func (d *NvidiaDevice) fingerprint(ctx context.Context, devices chan<- *device.FingerprintResponse) {
defer close(devices)
if d.nvmlClientInitializationError != nil {
d.logger.Error("exiting fingerprinting due to problems with NVML loading", "error", d.nvmlClientInitializationError)
// write empty fingerprint response to let server know that there are
// no working Nvidia GPU units
devices <- device.NewFingerprint()
return
}
// Create a timer that will fire immediately for the first detection
ticker := time.NewTimer(0)
for {
select {
case <-ctx.Done():
return
case <-ticker.C:
ticker.Reset(d.fingerprintPeriod)
}
d.writeFingerprintToChannel(devices)
}
}
// writeFingerprintToChannel makes nvml call and writes response to channel
func (d *NvidiaDevice) writeFingerprintToChannel(devices chan<- *device.FingerprintResponse) {
fingerprintData, err := d.nvmlClient.GetFingerprintData()
if err != nil {
d.logger.Error("failed to get fingerprint nvidia devices", "error", err)
devices <- device.NewFingerprintError(err)
return
}
// ignore devices from fingerprint output
fingerprintDevices := ignoreFingerprintedDevices(fingerprintData.Devices, d.ignoredGPUIDs)
// check if any device health was updated or any device was added to host
if !d.fingerprintChanged(fingerprintDevices) {
return
}
commonAttributes := map[string]string{
DriverVersionAttr: fingerprintData.DriverVersion,
}
// Group all FingerprintDevices by DeviceName attribute
deviceListByDeviceName := make(map[string][]*nvml.FingerprintDeviceData)
for _, device := range fingerprintDevices {
deviceName := device.DeviceName
if deviceName == nil {
// nvml driver was not able to detect the device name. Such devices
// are placed into a single group under the 'notAvailable' name
notAvailableCopy := notAvailable
deviceName = &notAvailableCopy
}
deviceListByDeviceName[*deviceName] = append(deviceListByDeviceName[*deviceName], device)
}
// Build Fingerprint response with computed groups and send it over the channel
deviceGroups := make([]*device.DeviceGroup, 0, len(deviceListByDeviceName))
for groupName, devices := range deviceListByDeviceName {
deviceGroups = append(deviceGroups, deviceGroupFromFingerprintData(groupName, devices, commonAttributes))
}
devices <- device.NewFingerprint(deviceGroups...)
}
// ignoreFingerprintedDevices excludes ignored devices from fingerprint output
func ignoreFingerprintedDevices(deviceData []*nvml.FingerprintDeviceData, ignoredGPUIDs map[string]struct{}) []*nvml.FingerprintDeviceData {
var result []*nvml.FingerprintDeviceData
for _, fingerprintDevice := range deviceData {
if _, ignored := ignoredGPUIDs[fingerprintDevice.UUID]; !ignored {
result = append(result, fingerprintDevice)
}
}
return result
}
// fingerprintChanged checks whether any previously unseen nvidia devices have appeared
// or any fingerprinted nvidia devices have disappeared since the last fingerprint run.
// It also updates the device map on NvidiaDevice with the latest data
func (d *NvidiaDevice) fingerprintChanged(allDevices []*nvml.FingerprintDeviceData) bool {
d.deviceLock.Lock()
defer d.deviceLock.Unlock()
changeDetected := false
// check if every device in allDevices is in d.devices
for _, device := range allDevices {
if _, ok := d.devices[device.UUID]; !ok {
changeDetected = true
}
}
// check if every device in d.devices is in allDevices
fingerprintDeviceMap := make(map[string]struct{})
for _, device := range allDevices {
fingerprintDeviceMap[device.UUID] = struct{}{}
}
for id := range d.devices {
if _, ok := fingerprintDeviceMap[id]; !ok {
changeDetected = true
}
}
d.devices = fingerprintDeviceMap
return changeDetected
}
// deviceGroupFromFingerprintData composes deviceGroup from FingerprintDeviceData slice
func deviceGroupFromFingerprintData(groupName string, deviceList []*nvml.FingerprintDeviceData, commonAttributes map[string]string) *device.DeviceGroup {
// deviceGroup without devices makes no sense -> return nil when no devices are provided
if len(deviceList) == 0 {
return nil
}
devices := make([]*device.Device, len(deviceList))
for index, dev := range deviceList {
devices[index] = &device.Device{
ID: dev.UUID,
// all fingerprinted devices are "healthy" for now
// to get real health data -> dcgm bindings should be used
Healthy: true,
HwLocality: &device.DeviceLocality{
PciBusID: dev.PCIBusID,
},
}
}
deviceGroup := &device.DeviceGroup{
Vendor: vendor,
Type: deviceType,
Name: groupName,
Devices: devices,
// It is assumed that devices with the same DeviceName share the same
// attributes, such as amount of memory, power, bar1memory, etc.
Attributes: attributesFromFingerprintDeviceData(deviceList[0]),
}
// Extend attribute map with common attributes
for attributeKey, attributeValue := range commonAttributes {
deviceGroup.Attributes[attributeKey] = attributeValue
}
return deviceGroup
}
// attributesFromFingerprintDeviceData converts nvml.FingerprintDeviceData
// struct to device.DeviceGroup.Attributes format (map[string]string)
// this function performs all nil checks for FingerprintDeviceData pointers
func attributesFromFingerprintDeviceData(fingerprintDeviceData *nvml.FingerprintDeviceData) map[string]string {
// The following fields in FingerprintDeviceData are pointers, so they can be nil
// In case they are nil -> return 'notAvailable' constant instead
var (
MemoryMiB string
PowerW string
BAR1MiB string
CoresClockMHz string
MemoryClockMHz string
PCIBandwidthMBPerS string
)
if fingerprintDeviceData.MemoryMiB == nil {
MemoryMiB = notAvailable
} else {
MemoryMiB = fmt.Sprint(*fingerprintDeviceData.MemoryMiB)
}
if fingerprintDeviceData.PowerW == nil {
PowerW = notAvailable
} else {
PowerW = fmt.Sprint(*fingerprintDeviceData.PowerW)
}
if fingerprintDeviceData.BAR1MiB == nil {
BAR1MiB = notAvailable
} else {
BAR1MiB = fmt.Sprint(*fingerprintDeviceData.BAR1MiB)
}
if fingerprintDeviceData.CoresClockMHz == nil {
CoresClockMHz = notAvailable
} else {
CoresClockMHz = fmt.Sprint(*fingerprintDeviceData.CoresClockMHz)
}
if fingerprintDeviceData.MemoryClockMHz == nil {
MemoryClockMHz = notAvailable
} else {
MemoryClockMHz = fmt.Sprint(*fingerprintDeviceData.MemoryClockMHz)
}
if fingerprintDeviceData.PCIBandwidthMBPerS == nil {
PCIBandwidthMBPerS = notAvailable
} else {
PCIBandwidthMBPerS = fmt.Sprint(*fingerprintDeviceData.PCIBandwidthMBPerS)
}
return map[string]string{
DisplayStateAttr: fingerprintDeviceData.DisplayState,
PersistenceModeAttr: fingerprintDeviceData.PersistenceMode,
MemoryMiBAttr: MemoryMiB,
PowerWAttr: PowerW,
BAR1MiBAttr: BAR1MiB,
CoresClockMHzAttr: CoresClockMHz,
MemoryClockMHzAttr: MemoryClockMHz,
PCIBandwidthMBPerSAttr: PCIBandwidthMBPerS,
}
}
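For illustration, the attribute map produced by attributesFromFingerprintDeviceData (plus the driver version merged in by deviceGroupFromFingerprintData) might look like the sketch below for a hypothetical card where nvml could not read the PCI bandwidth; every value is made up for the example.

```go
// Hypothetical example output; the values below are illustrative only.
var exampleAttributes = map[string]string{
	DriverVersionAttr:      "410.48", // merged in from commonAttributes
	DisplayStateAttr:       "Enabled",
	PersistenceModeAttr:    "Enabled",
	MemoryMiBAttr:          "16130",
	PowerWAttr:             "250",
	BAR1MiBAttr:            "256",
	CoresClockMHzAttr:      "1480",
	MemoryClockMHzAttr:     "715",
	PCIBandwidthMBPerSAttr: notAvailable, // nil pointer is reported as "N/A"
}
```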

File diff suppressed because it is too large


@@ -0,0 +1,194 @@
package nvml
import (
"fmt"
)
// DeviceData represents common fields for Nvidia device
type DeviceData struct {
UUID string
DeviceName *string
MemoryMiB *uint64
PowerW *uint
BAR1MiB *uint64
}
// FingerprintDeviceData is a superset of DeviceData
// it describes device specific fields returned from
// nvml queries during fingerprinting call
type FingerprintDeviceData struct {
*DeviceData
PCIBandwidthMBPerS *uint
CoresClockMHz *uint
MemoryClockMHz *uint
DisplayState string
PersistenceMode string
PCIBusID string
}
// FingerprintData represents attributes of the driver/devices
type FingerprintData struct {
Devices []*FingerprintDeviceData
DriverVersion string
}
// StatsData is a superset of DeviceData
// it represents statistics data returned for every Nvidia device
type StatsData struct {
*DeviceData
PowerUsageW *uint
GPUUtilization *uint
MemoryUtilization *uint
EncoderUtilization *uint
DecoderUtilization *uint
TemperatureC *uint
UsedMemoryMiB *uint64
BAR1UsedMiB *uint64
ECCErrorsL1Cache *uint64
ECCErrorsL2Cache *uint64
ECCErrorsDevice *uint64
}
// NvmlClient describes how users would use nvml library
type NvmlClient interface {
GetFingerprintData() (*FingerprintData, error)
GetStatsData() ([]*StatsData, error)
}
// nvmlClient implements NvmlClient
// Users of this lib are expected to use this struct via NewNvmlClient func
type nvmlClient struct {
driver NvmlDriver
}
// NewNvmlClient creates a new nvmlClient with the real
// NvmlDriver implementation and initializes that driver
func NewNvmlClient() (*nvmlClient, error) {
driver := &nvmlDriver{}
err := driver.Initialize()
if err != nil {
return nil, err
}
return &nvmlClient{
driver: driver,
}, nil
}
// GetFingerprintData returns FingerprintData for available Nvidia devices
func (c *nvmlClient) GetFingerprintData() (*FingerprintData, error) {
/*
nvml fields to be fingerprinted # nvml_library_call
1 - Driver Version # nvmlSystemGetDriverVersion
2 - Product Name # nvmlDeviceGetName
3 - GPU UUID # nvmlDeviceGetUUID
4 - Total Memory # nvmlDeviceGetMemoryInfo
5 - Power # nvmlDeviceGetPowerManagementLimit
6 - PCIBusID # nvmlDeviceGetPciInfo
7  - BAR1 Memory # nvmlDeviceGetBAR1MemoryInfo
8 - PCI Bandwidth
9 - Memory, Cores Clock # nvmlDeviceGetMaxClockInfo
10 - Display Mode # nvmlDeviceGetDisplayMode
11 - Persistence Mode # nvmlDeviceGetPersistenceMode
*/
// It is assumed that this method is called with a receiver obtained from
// NewNvmlClient, because NewNvmlClient handles initialization of the
// NVML library
driverVersion, err := c.driver.SystemDriverVersion()
if err != nil {
return nil, fmt.Errorf("nvidia nvml SystemDriverVersion() error: %v\n", err)
}
numDevices, err := c.driver.DeviceCount()
if err != nil {
return nil, fmt.Errorf("nvidia nvml DeviceCount() error: %v\n", err)
}
allNvidiaGPUResources := make([]*FingerprintDeviceData, numDevices)
for i := 0; i < int(numDevices); i++ {
deviceInfo, err := c.driver.DeviceInfoByIndex(uint(i))
if err != nil {
return nil, fmt.Errorf("nvidia nvml DeviceInfoByIndex() error: %v\n", err)
}
allNvidiaGPUResources[i] = &FingerprintDeviceData{
DeviceData: &DeviceData{
DeviceName: deviceInfo.Name,
UUID: deviceInfo.UUID,
MemoryMiB: deviceInfo.MemoryMiB,
PowerW: deviceInfo.PowerW,
BAR1MiB: deviceInfo.BAR1MiB,
},
PCIBandwidthMBPerS: deviceInfo.PCIBandwidthMBPerS,
CoresClockMHz: deviceInfo.CoresClockMHz,
MemoryClockMHz: deviceInfo.MemoryClockMHz,
DisplayState: deviceInfo.DisplayState,
PersistenceMode: deviceInfo.PersistenceMode,
PCIBusID: deviceInfo.PCIBusID,
}
}
return &FingerprintData{
Devices: allNvidiaGPUResources,
DriverVersion: driverVersion,
}, nil
}
// GetStatsData returns statistics data for all devices on this machine
func (c *nvmlClient) GetStatsData() ([]*StatsData, error) {
/*
nvml fields to be reported to stats api # nvml_library_call
1 - Used Memory # nvmlDeviceGetMemoryInfo
2 - Utilization of GPU # nvmlDeviceGetUtilizationRates
3 - Utilization of Memory # nvmlDeviceGetUtilizationRates
4 - Utilization of Decoder # nvmlDeviceGetDecoderUtilization
5 - Utilization of Encoder # nvmlDeviceGetEncoderUtilization
6 - Current GPU Temperature # nvmlDeviceGetTemperature
7 - Power Draw # nvmlDeviceGetPowerUsage
8 - BAR1 Used memory # nvmlDeviceGetBAR1MemoryInfo
9 - ECC Errors on requesting L1Cache # nvmlDeviceGetMemoryErrorCounter
10 - ECC Errors on requesting L2Cache # nvmlDeviceGetMemoryErrorCounter
11 - ECC Errors on requesting Device memory # nvmlDeviceGetMemoryErrorCounter
*/
// It is assumed that this method is called with a receiver obtained from
// NewNvmlClient, because NewNvmlClient handles initialization of the
// NVML library
numDevices, err := c.driver.DeviceCount()
if err != nil {
return nil, fmt.Errorf("nvidia nvml DeviceCount() error: %v\n", err)
}
allNvidiaGPUStats := make([]*StatsData, numDevices)
for i := 0; i < int(numDevices); i++ {
deviceInfo, deviceStatus, err := c.driver.DeviceInfoAndStatusByIndex(uint(i))
if err != nil {
return nil, fmt.Errorf("nvidia nvml DeviceInfoAndStatusByIndex() error: %v\n", err)
}
allNvidiaGPUStats[i] = &StatsData{
DeviceData: &DeviceData{
DeviceName: deviceInfo.Name,
UUID: deviceInfo.UUID,
MemoryMiB: deviceInfo.MemoryMiB,
PowerW: deviceInfo.PowerW,
BAR1MiB: deviceInfo.BAR1MiB,
},
PowerUsageW: deviceStatus.PowerUsageW,
GPUUtilization: deviceStatus.GPUUtilization,
MemoryUtilization: deviceStatus.MemoryUtilization,
EncoderUtilization: deviceStatus.EncoderUtilization,
DecoderUtilization: deviceStatus.DecoderUtilization,
TemperatureC: deviceStatus.TemperatureC,
UsedMemoryMiB: deviceStatus.UsedMemoryMiB,
BAR1UsedMiB: deviceStatus.BAR1UsedMiB,
ECCErrorsL1Cache: deviceStatus.ECCErrorsL1Cache,
ECCErrorsL2Cache: deviceStatus.ECCErrorsL2Cache,
ECCErrorsDevice: deviceStatus.ECCErrorsDevice,
}
}
return allNvidiaGPUStats, nil
}
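A short, hypothetical usage sketch for NvmlClient follows; it mirrors what the plugin does in device.go, and the function name and printing are illustrative only, not part of this change.

```go
// printFingerprint is a hypothetical usage sketch for NvmlClient.
func printFingerprint() error {
	client, err := NewNvmlClient()
	if err != nil {
		return err
	}
	fp, err := client.GetFingerprintData()
	if err != nil {
		return err
	}
	fmt.Println("driver version:", fp.DriverVersion)
	for _, d := range fp.Devices {
		// DeviceName is a pointer and may be nil when nvml could not detect it.
		name := "unknown"
		if d.DeviceName != nil {
			name = *d.DeviceName
		}
		fmt.Println(d.UUID, name)
	}
	return nil
}
```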


@@ -0,0 +1,399 @@
package nvml
import (
"errors"
"testing"
"github.com/hashicorp/nomad/helper"
"github.com/stretchr/testify/require"
)
type MockNVMLDriver struct {
systemDriverCallSuccessful bool
deviceCountCallSuccessful bool
deviceInfoByIndexCallSuccessful bool
deviceInfoAndStatusByIndexCallSuccessful bool
driverVersion string
devices []*DeviceInfo
deviceStatus []*DeviceStatus
}
func (m *MockNVMLDriver) Initialize() error {
return nil
}
func (m *MockNVMLDriver) Shutdown() error {
return nil
}
func (m *MockNVMLDriver) SystemDriverVersion() (string, error) {
if !m.systemDriverCallSuccessful {
return "", errors.New("failed to get system driver")
}
return m.driverVersion, nil
}
func (m *MockNVMLDriver) DeviceCount() (uint, error) {
if !m.deviceCountCallSuccessful {
return 0, errors.New("failed to get device length")
}
return uint(len(m.devices)), nil
}
func (m *MockNVMLDriver) DeviceInfoByIndex(index uint) (*DeviceInfo, error) {
if index >= uint(len(m.devices)) {
return nil, errors.New("index is out of range")
}
if !m.deviceInfoByIndexCallSuccessful {
return nil, errors.New("failed to get device info by index")
}
return m.devices[index], nil
}
func (m *MockNVMLDriver) DeviceInfoAndStatusByIndex(index uint) (*DeviceInfo, *DeviceStatus, error) {
if index >= uint(len(m.devices)) || index >= uint(len(m.deviceStatus)) {
return nil, nil, errors.New("index is out of range")
}
if !m.deviceInfoAndStatusByIndexCallSuccessful {
return nil, nil, errors.New("failed to get device info and status by index")
}
return m.devices[index], m.deviceStatus[index], nil
}
func TestGetFingerprintDataFromNVML(t *testing.T) {
for _, testCase := range []struct {
Name string
DriverConfiguration *MockNVMLDriver
ExpectedError bool
ExpectedResult *FingerprintData
}{
{
Name: "fail on systemDriverCallSuccessful",
ExpectedError: true,
ExpectedResult: nil,
DriverConfiguration: &MockNVMLDriver{
systemDriverCallSuccessful: false,
deviceCountCallSuccessful: true,
deviceInfoByIndexCallSuccessful: true,
},
},
{
Name: "fail on deviceCountCallSuccessful",
ExpectedError: true,
ExpectedResult: nil,
DriverConfiguration: &MockNVMLDriver{
systemDriverCallSuccessful: true,
deviceCountCallSuccessful: false,
deviceInfoByIndexCallSuccessful: true,
},
},
{
Name: "fail on deviceInfoByIndexCall",
ExpectedError: true,
ExpectedResult: nil,
DriverConfiguration: &MockNVMLDriver{
systemDriverCallSuccessful: true,
deviceCountCallSuccessful: true,
deviceInfoByIndexCallSuccessful: false,
devices: []*DeviceInfo{
{
UUID: "UUID1",
Name: helper.StringToPtr("ModelName1"),
MemoryMiB: helper.Uint64ToPtr(16),
PCIBusID: "busId",
PowerW: helper.UintToPtr(100),
BAR1MiB: helper.Uint64ToPtr(100),
PCIBandwidthMBPerS: helper.UintToPtr(100),
CoresClockMHz: helper.UintToPtr(100),
MemoryClockMHz: helper.UintToPtr(100),
}, {
UUID: "UUID2",
Name: helper.StringToPtr("ModelName2"),
MemoryMiB: helper.Uint64ToPtr(8),
PCIBusID: "busId",
PowerW: helper.UintToPtr(100),
BAR1MiB: helper.Uint64ToPtr(100),
PCIBandwidthMBPerS: helper.UintToPtr(100),
CoresClockMHz: helper.UintToPtr(100),
MemoryClockMHz: helper.UintToPtr(100),
},
},
},
},
{
Name: "successful outcome",
ExpectedError: false,
ExpectedResult: &FingerprintData{
DriverVersion: "driverVersion",
Devices: []*FingerprintDeviceData{
{
DeviceData: &DeviceData{
DeviceName: helper.StringToPtr("ModelName1"),
UUID: "UUID1",
MemoryMiB: helper.Uint64ToPtr(16),
PowerW: helper.UintToPtr(100),
BAR1MiB: helper.Uint64ToPtr(100),
},
PCIBusID: "busId1",
PCIBandwidthMBPerS: helper.UintToPtr(100),
CoresClockMHz: helper.UintToPtr(100),
MemoryClockMHz: helper.UintToPtr(100),
DisplayState: "Enabled",
PersistenceMode: "Enabled",
}, {
DeviceData: &DeviceData{
DeviceName: helper.StringToPtr("ModelName2"),
UUID: "UUID2",
MemoryMiB: helper.Uint64ToPtr(8),
PowerW: helper.UintToPtr(200),
BAR1MiB: helper.Uint64ToPtr(200),
},
PCIBusID: "busId2",
PCIBandwidthMBPerS: helper.UintToPtr(200),
CoresClockMHz: helper.UintToPtr(200),
MemoryClockMHz: helper.UintToPtr(200),
DisplayState: "Enabled",
PersistenceMode: "Enabled",
},
},
},
DriverConfiguration: &MockNVMLDriver{
systemDriverCallSuccessful: true,
deviceCountCallSuccessful: true,
deviceInfoByIndexCallSuccessful: true,
driverVersion: "driverVersion",
devices: []*DeviceInfo{
{
UUID: "UUID1",
Name: helper.StringToPtr("ModelName1"),
MemoryMiB: helper.Uint64ToPtr(16),
PCIBusID: "busId1",
PowerW: helper.UintToPtr(100),
BAR1MiB: helper.Uint64ToPtr(100),
PCIBandwidthMBPerS: helper.UintToPtr(100),
CoresClockMHz: helper.UintToPtr(100),
MemoryClockMHz: helper.UintToPtr(100),
DisplayState: "Enabled",
PersistenceMode: "Enabled",
}, {
UUID: "UUID2",
Name: helper.StringToPtr("ModelName2"),
MemoryMiB: helper.Uint64ToPtr(8),
PCIBusID: "busId2",
PowerW: helper.UintToPtr(200),
BAR1MiB: helper.Uint64ToPtr(200),
PCIBandwidthMBPerS: helper.UintToPtr(200),
CoresClockMHz: helper.UintToPtr(200),
MemoryClockMHz: helper.UintToPtr(200),
DisplayState: "Enabled",
PersistenceMode: "Enabled",
},
},
},
},
} {
cli := nvmlClient{driver: testCase.DriverConfiguration}
fingerprintData, err := cli.GetFingerprintData()
if testCase.ExpectedError && err == nil {
t.Errorf("case '%s' : expected Error, but didn't get one", testCase.Name)
}
if !testCase.ExpectedError && err != nil {
t.Errorf("case '%s' : unexpected Error '%v'", testCase.Name, err)
}
require.New(t).Equal(testCase.ExpectedResult, fingerprintData)
}
}
func TestGetStatsDataFromNVML(t *testing.T) {
for _, testCase := range []struct {
Name string
DriverConfiguration *MockNVMLDriver
ExpectedError bool
ExpectedResult []*StatsData
}{
{
Name: "fail on deviceCountCallSuccessful",
ExpectedError: true,
ExpectedResult: nil,
DriverConfiguration: &MockNVMLDriver{
systemDriverCallSuccessful: true,
deviceCountCallSuccessful: false,
deviceInfoByIndexCallSuccessful: true,
deviceInfoAndStatusByIndexCallSuccessful: true,
},
},
{
Name: "fail on DeviceInfoAndStatusByIndex call",
ExpectedError: true,
ExpectedResult: nil,
DriverConfiguration: &MockNVMLDriver{
systemDriverCallSuccessful: true,
deviceCountCallSuccessful: true,
deviceInfoAndStatusByIndexCallSuccessful: false,
devices: []*DeviceInfo{
{
UUID: "UUID1",
Name: helper.StringToPtr("ModelName1"),
MemoryMiB: helper.Uint64ToPtr(16),
PCIBusID: "busId1",
PowerW: helper.UintToPtr(100),
BAR1MiB: helper.Uint64ToPtr(100),
PCIBandwidthMBPerS: helper.UintToPtr(100),
CoresClockMHz: helper.UintToPtr(100),
MemoryClockMHz: helper.UintToPtr(100),
}, {
UUID: "UUID2",
Name: helper.StringToPtr("ModelName2"),
MemoryMiB: helper.Uint64ToPtr(8),
PCIBusID: "busId2",
PowerW: helper.UintToPtr(200),
BAR1MiB: helper.Uint64ToPtr(200),
PCIBandwidthMBPerS: helper.UintToPtr(200),
CoresClockMHz: helper.UintToPtr(200),
MemoryClockMHz: helper.UintToPtr(200),
},
},
deviceStatus: []*DeviceStatus{
{
TemperatureC: helper.UintToPtr(1),
GPUUtilization: helper.UintToPtr(1),
MemoryUtilization: helper.UintToPtr(1),
EncoderUtilization: helper.UintToPtr(1),
DecoderUtilization: helper.UintToPtr(1),
UsedMemoryMiB: helper.Uint64ToPtr(1),
ECCErrorsL1Cache: helper.Uint64ToPtr(1),
ECCErrorsL2Cache: helper.Uint64ToPtr(1),
ECCErrorsDevice: helper.Uint64ToPtr(1),
PowerUsageW: helper.UintToPtr(1),
BAR1UsedMiB: helper.Uint64ToPtr(1),
},
{
TemperatureC: helper.UintToPtr(2),
GPUUtilization: helper.UintToPtr(2),
MemoryUtilization: helper.UintToPtr(2),
EncoderUtilization: helper.UintToPtr(2),
DecoderUtilization: helper.UintToPtr(2),
UsedMemoryMiB: helper.Uint64ToPtr(2),
ECCErrorsL1Cache: helper.Uint64ToPtr(2),
ECCErrorsL2Cache: helper.Uint64ToPtr(2),
ECCErrorsDevice: helper.Uint64ToPtr(2),
PowerUsageW: helper.UintToPtr(2),
BAR1UsedMiB: helper.Uint64ToPtr(2),
},
},
},
},
{
Name: "successful outcome",
ExpectedError: false,
ExpectedResult: []*StatsData{
{
DeviceData: &DeviceData{
DeviceName: helper.StringToPtr("ModelName1"),
UUID: "UUID1",
MemoryMiB: helper.Uint64ToPtr(16),
PowerW: helper.UintToPtr(100),
BAR1MiB: helper.Uint64ToPtr(100),
},
TemperatureC: helper.UintToPtr(1),
GPUUtilization: helper.UintToPtr(1),
MemoryUtilization: helper.UintToPtr(1),
EncoderUtilization: helper.UintToPtr(1),
DecoderUtilization: helper.UintToPtr(1),
UsedMemoryMiB: helper.Uint64ToPtr(1),
ECCErrorsL1Cache: helper.Uint64ToPtr(1),
ECCErrorsL2Cache: helper.Uint64ToPtr(1),
ECCErrorsDevice: helper.Uint64ToPtr(1),
PowerUsageW: helper.UintToPtr(1),
BAR1UsedMiB: helper.Uint64ToPtr(1),
},
{
DeviceData: &DeviceData{
DeviceName: helper.StringToPtr("ModelName2"),
UUID: "UUID2",
MemoryMiB: helper.Uint64ToPtr(8),
PowerW: helper.UintToPtr(200),
BAR1MiB: helper.Uint64ToPtr(200),
},
TemperatureC: helper.UintToPtr(2),
GPUUtilization: helper.UintToPtr(2),
MemoryUtilization: helper.UintToPtr(2),
EncoderUtilization: helper.UintToPtr(2),
DecoderUtilization: helper.UintToPtr(2),
UsedMemoryMiB: helper.Uint64ToPtr(2),
ECCErrorsL1Cache: helper.Uint64ToPtr(2),
ECCErrorsL2Cache: helper.Uint64ToPtr(2),
ECCErrorsDevice: helper.Uint64ToPtr(2),
PowerUsageW: helper.UintToPtr(2),
BAR1UsedMiB: helper.Uint64ToPtr(2),
},
},
DriverConfiguration: &MockNVMLDriver{
deviceCountCallSuccessful: true,
deviceInfoByIndexCallSuccessful: true,
deviceInfoAndStatusByIndexCallSuccessful: true,
devices: []*DeviceInfo{
{
UUID: "UUID1",
Name: helper.StringToPtr("ModelName1"),
MemoryMiB: helper.Uint64ToPtr(16),
PCIBusID: "busId1",
PowerW: helper.UintToPtr(100),
BAR1MiB: helper.Uint64ToPtr(100),
PCIBandwidthMBPerS: helper.UintToPtr(100),
CoresClockMHz: helper.UintToPtr(100),
MemoryClockMHz: helper.UintToPtr(100),
}, {
UUID: "UUID2",
Name: helper.StringToPtr("ModelName2"),
MemoryMiB: helper.Uint64ToPtr(8),
PCIBusID: "busId2",
PowerW: helper.UintToPtr(200),
BAR1MiB: helper.Uint64ToPtr(200),
PCIBandwidthMBPerS: helper.UintToPtr(200),
CoresClockMHz: helper.UintToPtr(200),
MemoryClockMHz: helper.UintToPtr(200),
},
},
deviceStatus: []*DeviceStatus{
{
TemperatureC: helper.UintToPtr(1),
GPUUtilization: helper.UintToPtr(1),
MemoryUtilization: helper.UintToPtr(1),
EncoderUtilization: helper.UintToPtr(1),
DecoderUtilization: helper.UintToPtr(1),
UsedMemoryMiB: helper.Uint64ToPtr(1),
ECCErrorsL1Cache: helper.Uint64ToPtr(1),
ECCErrorsL2Cache: helper.Uint64ToPtr(1),
ECCErrorsDevice: helper.Uint64ToPtr(1),
PowerUsageW: helper.UintToPtr(1),
BAR1UsedMiB: helper.Uint64ToPtr(1),
},
{
TemperatureC: helper.UintToPtr(2),
GPUUtilization: helper.UintToPtr(2),
MemoryUtilization: helper.UintToPtr(2),
EncoderUtilization: helper.UintToPtr(2),
DecoderUtilization: helper.UintToPtr(2),
UsedMemoryMiB: helper.Uint64ToPtr(2),
ECCErrorsL1Cache: helper.Uint64ToPtr(2),
ECCErrorsL2Cache: helper.Uint64ToPtr(2),
ECCErrorsDevice: helper.Uint64ToPtr(2),
PowerUsageW: helper.UintToPtr(2),
BAR1UsedMiB: helper.Uint64ToPtr(2),
},
},
},
},
} {
cli := nvmlClient{driver: testCase.DriverConfiguration}
statsData, err := cli.GetStatsData()
if testCase.ExpectedError && err == nil {
t.Errorf("case '%s' : expected Error, but didn't get one", testCase.Name)
}
if !testCase.ExpectedError && err != nil {
t.Errorf("case '%s' : unexpected Error '%v'", testCase.Name, err)
}
require.New(t).Equal(testCase.ExpectedResult, statsData)
}
}


@@ -0,0 +1,138 @@
package nvml
import (
"github.com/NVIDIA/gpu-monitoring-tools/bindings/go/nvml"
)
// DeviceInfo represents nvml device data
// this struct is returned by NvmlDriver DeviceInfoByIndex and
// DeviceInfoAndStatusByIndex methods
type DeviceInfo struct {
// The following fields are guaranteed to be retrieved from nvml
UUID string
PCIBusID string
DisplayState string
PersistenceMode string
// The following fields can be nil after a call to nvml, because nvml was
// not able to retrieve these fields for the specific nvidia card
Name *string
MemoryMiB *uint64
PowerW *uint
BAR1MiB *uint64
PCIBandwidthMBPerS *uint
CoresClockMHz *uint
MemoryClockMHz *uint
}
// DeviceStatus represents nvml device status
// this struct is returned by NvmlDriver DeviceInfoAndStatusByIndex method
type DeviceStatus struct {
// The following fields can be nil after a call to nvml, because nvml was
// not able to retrieve these fields for the specific nvidia card
PowerUsageW *uint
TemperatureC *uint
GPUUtilization *uint // %
MemoryUtilization *uint // %
EncoderUtilization *uint // %
DecoderUtilization *uint // %
BAR1UsedMiB *uint64
UsedMemoryMiB *uint64
ECCErrorsL1Cache *uint64
ECCErrorsL2Cache *uint64
ECCErrorsDevice *uint64
}
// NvmlDriver represents set of methods to query nvml library
type NvmlDriver interface {
Initialize() error
Shutdown() error
SystemDriverVersion() (string, error)
DeviceCount() (uint, error)
DeviceInfoByIndex(uint) (*DeviceInfo, error)
DeviceInfoAndStatusByIndex(uint) (*DeviceInfo, *DeviceStatus, error)
}
// nvmlDriver implements NvmlDriver
// Users are required to call Initialize method before using any other methods
type nvmlDriver struct{}
// Initialize initializes the nvml library by locating the nvml shared object file and calling dlopen
func (n *nvmlDriver) Initialize() error {
return nvml.Init()
}
// Shutdown stops any further interaction with nvml
func (n *nvmlDriver) Shutdown() error {
return nvml.Shutdown()
}
// SystemDriverVersion returns installed driver version
func (n *nvmlDriver) SystemDriverVersion() (string, error) {
return nvml.GetDriverVersion()
}
// DeviceCount reports number of available GPU devices
func (n *nvmlDriver) DeviceCount() (uint, error) {
return nvml.GetDeviceCount()
}
// DeviceInfoByIndex returns DeviceInfo for index GPU in system device list
func (n *nvmlDriver) DeviceInfoByIndex(index uint) (*DeviceInfo, error) {
device, err := nvml.NewDevice(index)
if err != nil {
return nil, err
}
deviceMode, err := device.GetDeviceMode()
if err != nil {
return nil, err
}
return &DeviceInfo{
UUID: device.UUID,
Name: device.Model,
MemoryMiB: device.Memory,
PowerW: device.Power,
BAR1MiB: device.PCI.BAR1,
PCIBandwidthMBPerS: device.PCI.Bandwidth,
PCIBusID: device.PCI.BusID,
CoresClockMHz: device.Clocks.Cores,
MemoryClockMHz: device.Clocks.Memory,
DisplayState: deviceMode.DisplayInfo.Mode.String(),
PersistenceMode: deviceMode.Persistence.String(),
}, nil
}
// DeviceInfoAndStatusByIndex returns DeviceInfo and DeviceStatus for index GPU in system device list
func (n *nvmlDriver) DeviceInfoAndStatusByIndex(index uint) (*DeviceInfo, *DeviceStatus, error) {
device, err := nvml.NewDevice(index)
if err != nil {
return nil, nil, err
}
status, err := device.Status()
if err != nil {
return nil, nil, err
}
return &DeviceInfo{
UUID: device.UUID,
Name: device.Model,
MemoryMiB: device.Memory,
PowerW: device.Power,
BAR1MiB: device.PCI.BAR1,
PCIBandwidthMBPerS: device.PCI.Bandwidth,
PCIBusID: device.PCI.BusID,
CoresClockMHz: device.Clocks.Cores,
MemoryClockMHz: device.Clocks.Memory,
}, &DeviceStatus{
TemperatureC: status.Temperature,
GPUUtilization: status.Utilization.GPU,
MemoryUtilization: status.Utilization.Memory,
EncoderUtilization: status.Utilization.Encoder,
DecoderUtilization: status.Utilization.Decoder,
UsedMemoryMiB: status.Memory.Global.Used,
ECCErrorsL1Cache: status.Memory.ECCErrors.L1Cache,
ECCErrorsL2Cache: status.Memory.ECCErrors.L2Cache,
ECCErrorsDevice: status.Memory.ECCErrors.Device,
PowerUsageW: status.Power,
BAR1UsedMiB: status.PCI.BAR1Used,
}, nil
}
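As noted above, callers must call Initialize before any other method and should call Shutdown when done. The sketch below is a hypothetical illustration of that call sequence; it is not part of this change.

```go
// listGPUs is a hypothetical sketch of the NvmlDriver call sequence.
func listGPUs() ([]*DeviceInfo, error) {
	var driver NvmlDriver = &nvmlDriver{}
	if err := driver.Initialize(); err != nil {
		return nil, err
	}
	defer driver.Shutdown()

	count, err := driver.DeviceCount()
	if err != nil {
		return nil, err
	}
	infos := make([]*DeviceInfo, 0, count)
	for i := uint(0); i < count; i++ {
		info, err := driver.DeviceInfoByIndex(i)
		if err != nil {
			return nil, err
		}
		infos = append(infos, info) // UUID, PCIBusID and optional (pointer) attributes
	}
	return infos, nil
}
```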


@@ -0,0 +1,301 @@
package nvidia
import (
"context"
"time"
"github.com/hashicorp/nomad/plugins/device"
"github.com/hashicorp/nomad/plugins/device/cmd/nvidia/nvml"
)
const (
// Attribute names for reporting stats output
PowerUsageAttr = "Power usage"
PowerUsageUnit = "W"
PowerUsageDesc = "Power usage for this GPU in watts and " +
"its associated circuitry (e.g. memory) / Maximum GPU Power"
GPUUtilizationAttr = "GPU utilization"
GPUUtilizationUnit = "%"
GPUUtilizationDesc = "Percent of time over the past sample period " +
"during which one or more kernels were executing on the GPU."
MemoryUtilizationAttr = "Memory utilization"
MemoryUtilizationUnit = "%"
MemoryUtilizationDesc = "Percentage of bandwidth used during the past sample period"
EncoderUtilizationAttr = "Encoder utilization"
EncoderUtilizationUnit = "%"
EncoderUtilizationDesc = "Percent of time over the past sample period " +
"during which GPU Encoder was used"
DecoderUtilizationAttr = "Decoder utilization"
DecoderUtilizationUnit = "%"
DecoderUtilizationDesc = "Percent of time over the past sample period " +
"during which GPU Decoder was used"
TemperatureAttr = "Temperature"
TemperatureUnit = "C" // Celsius degrees
TemperatureDesc = "Temperature of the Unit"
MemoryStateAttr = "Memory state"
MemoryStateUnit = "MiB" // Mebibytes
MemoryStateDesc = "UsedMemory / TotalMemory"
BAR1StateAttr = "BAR1 buffer state"
BAR1StateUnit = "MiB" // Mebibytes
BAR1StateDesc = "UsedBAR1 / TotalBAR1"
ECCErrorsL1CacheAttr = "ECC L1 errors"
ECCErrorsL1CacheUnit = "#" // number of errors
ECCErrorsL1CacheDesc = "Requested L1Cache error counter for the device"
ECCErrorsL2CacheAttr = "ECC L2 errors"
ECCErrorsL2CacheUnit = "#" // number of errors
ECCErrorsL2CacheDesc = "Requested L2Cache error counter for the device"
ECCErrorsDeviceAttr = "ECC memory errors"
ECCErrorsDeviceUnit = "#" // number of errors
ECCErrorsDeviceDesc = "Requested memory error counter for the device"
)
// stats is the long running goroutine that streams device statistics
func (d *NvidiaDevice) stats(ctx context.Context, stats chan<- *device.StatsResponse) {
defer close(stats)
if d.nvmlClientInitializationError != nil {
d.logger.Error("exiting stats due to problems with NVML loading", "error", d.nvmlClientInitializationError)
return
}
// Create a timer that will fire immediately for the first detection
ticker := time.NewTimer(0)
for {
select {
case <-ctx.Done():
return
case <-ticker.C:
ticker.Reset(d.statsPeriod)
}
d.writeStatsToChannel(stats, time.Now())
}
}
// filterStatsByID accepts a list of StatsData and a set of IDs, and returns
// the StatsData entries whose IDs are found in the set
func filterStatsByID(stats []*nvml.StatsData, IDs map[string]struct{}) []*nvml.StatsData {
var filteredStats []*nvml.StatsData
for _, statsItem := range stats {
if _, ok := IDs[statsItem.UUID]; ok {
filteredStats = append(filteredStats, statsItem)
}
}
return filteredStats
}
// writeStatsToChannel collects StatsData from NVML backend, groups StatsData
// by DeviceName attribute, populates DeviceGroupStats structure for every group
// and sends data over provided channel
func (d *NvidiaDevice) writeStatsToChannel(stats chan<- *device.StatsResponse, timestamp time.Time) {
statsData, err := d.nvmlClient.GetStatsData()
if err != nil {
d.logger.Error("failed to get nvidia stats", "error", err)
stats <- &device.StatsResponse{
Error: err,
}
return
}
// filter only stats from devices that are stored in NvidiaDevice struct
d.deviceLock.RLock()
statsData = filterStatsByID(statsData, d.devices)
d.deviceLock.RUnlock()
// group stats by DeviceName struct field
statsListByDeviceName := make(map[string][]*nvml.StatsData)
for _, statsItem := range statsData {
deviceName := statsItem.DeviceName
if deviceName == nil {
// nvml driver was not able to detect the device name. Such devices
// are placed into a single group under the 'notAvailable' name
notAvailableCopy := notAvailable
deviceName = &notAvailableCopy
}
statsListByDeviceName[*deviceName] = append(statsListByDeviceName[*deviceName], statsItem)
}
// populate a device.DeviceGroupStats struct for every group of stats
deviceGroupsStats := make([]*device.DeviceGroupStats, 0, len(statsListByDeviceName))
for groupName, groupStats := range statsListByDeviceName {
deviceGroupsStats = append(deviceGroupsStats, statsForGroup(groupName, groupStats, timestamp))
}
stats <- &device.StatsResponse{
Groups: deviceGroupsStats,
}
}
func newNotAvailableDeviceStats(unit, desc string) *device.StatValue {
return &device.StatValue{Unit: unit, Desc: desc, StringVal: notAvailable}
}
// statsForGroup is a helper function that populates device.DeviceGroupStats
// for given groupName with groupStats list
func statsForGroup(groupName string, groupStats []*nvml.StatsData, timestamp time.Time) *device.DeviceGroupStats {
instanceStats := make(map[string]*device.DeviceStats)
for _, statsItem := range groupStats {
instanceStats[statsItem.UUID] = statsForItem(statsItem, timestamp)
}
return &device.DeviceGroupStats{
Vendor: vendor,
Type: deviceType,
Name: groupName,
InstanceStats: instanceStats,
}
}
// statsForItem is a helper function that populates device.DeviceStats for given
// nvml.StatsData
func statsForItem(statsItem *nvml.StatsData, timestamp time.Time) *device.DeviceStats {
// nvml.StatsData holds pointers to values that can be nil
// In case they are nil return stats with 'notAvailable' constant
var (
powerUsageStat *device.StatValue
GPUUtilizationStat *device.StatValue
memoryUtilizationStat *device.StatValue
encoderUtilizationStat *device.StatValue
decoderUtilizationStat *device.StatValue
temperatureStat *device.StatValue
memoryStateStat *device.StatValue
BAR1StateStat *device.StatValue
ECCErrorsL1CacheStat *device.StatValue
ECCErrorsL2CacheStat *device.StatValue
ECCErrorsDeviceStat *device.StatValue
)
if statsItem.PowerUsageW == nil || statsItem.PowerW == nil {
powerUsageStat = newNotAvailableDeviceStats(PowerUsageUnit, PowerUsageDesc)
} else {
powerUsageStat = &device.StatValue{
Unit: PowerUsageUnit,
Desc: PowerUsageDesc,
IntNumeratorVal: int64(*statsItem.PowerUsageW),
IntDenominatorVal: int64(*statsItem.PowerW),
}
}
if statsItem.GPUUtilization == nil {
GPUUtilizationStat = newNotAvailableDeviceStats(GPUUtilizationUnit, GPUUtilizationDesc)
} else {
GPUUtilizationStat = &device.StatValue{
Unit: GPUUtilizationUnit,
Desc: GPUUtilizationDesc,
IntNumeratorVal: int64(*statsItem.GPUUtilization),
}
}
if statsItem.MemoryUtilization == nil {
memoryUtilizationStat = newNotAvailableDeviceStats(MemoryUtilizationUnit, MemoryUtilizationDesc)
} else {
memoryUtilizationStat = &device.StatValue{
Unit: MemoryUtilizationUnit,
Desc: MemoryUtilizationDesc,
IntNumeratorVal: int64(*statsItem.MemoryUtilization),
}
}
if statsItem.EncoderUtilization == nil {
encoderUtilizationStat = newNotAvailableDeviceStats(EncoderUtilizationUnit, EncoderUtilizationDesc)
} else {
encoderUtilizationStat = &device.StatValue{
Unit: EncoderUtilizationUnit,
Desc: EncoderUtilizationDesc,
IntNumeratorVal: int64(*statsItem.EncoderUtilization),
}
}
if statsItem.DecoderUtilization == nil {
decoderUtilizationStat = newNotAvailableDeviceStats(DecoderUtilizationUnit, DecoderUtilizationDesc)
} else {
decoderUtilizationStat = &device.StatValue{
Unit: DecoderUtilizationUnit,
Desc: DecoderUtilizationDesc,
IntNumeratorVal: int64(*statsItem.DecoderUtilization),
}
}
if statsItem.TemperatureC == nil {
temperatureStat = newNotAvailableDeviceStats(TemperatureUnit, TemperatureDesc)
} else {
temperatureStat = &device.StatValue{
Unit: TemperatureUnit,
Desc: TemperatureDesc,
IntNumeratorVal: int64(*statsItem.TemperatureC),
}
}
if statsItem.UsedMemoryMiB == nil || statsItem.MemoryMiB == nil {
memoryStateStat = newNotAvailableDeviceStats(MemoryStateUnit, MemoryStateDesc)
} else {
memoryStateStat = &device.StatValue{
Unit: MemoryStateUnit,
Desc: MemoryStateDesc,
IntNumeratorVal: int64(*statsItem.UsedMemoryMiB),
IntDenominatorVal: int64(*statsItem.MemoryMiB),
}
}
if statsItem.BAR1UsedMiB == nil || statsItem.BAR1MiB == nil {
BAR1StateStat = newNotAvailableDeviceStats(BAR1StateUnit, BAR1StateDesc)
} else {
BAR1StateStat = &device.StatValue{
Unit: BAR1StateUnit,
Desc: BAR1StateDesc,
IntNumeratorVal: int64(*statsItem.BAR1UsedMiB),
IntDenominatorVal: int64(*statsItem.BAR1MiB),
}
}
if statsItem.ECCErrorsL1Cache == nil {
ECCErrorsL1CacheStat = newNotAvailableDeviceStats(ECCErrorsL1CacheUnit, ECCErrorsL1CacheDesc)
} else {
ECCErrorsL1CacheStat = &device.StatValue{
Unit: ECCErrorsL1CacheUnit,
Desc: ECCErrorsL1CacheDesc,
IntNumeratorVal: int64(*statsItem.ECCErrorsL1Cache),
}
}
if statsItem.ECCErrorsL2Cache == nil {
ECCErrorsL2CacheStat = newNotAvailableDeviceStats(ECCErrorsL2CacheUnit, ECCErrorsL2CacheDesc)
} else {
ECCErrorsL2CacheStat = &device.StatValue{
Unit: ECCErrorsL2CacheUnit,
Desc: ECCErrorsL2CacheDesc,
IntNumeratorVal: int64(*statsItem.ECCErrorsL2Cache),
}
}
if statsItem.ECCErrorsDevice == nil {
ECCErrorsDeviceStat = newNotAvailableDeviceStats(ECCErrorsDeviceUnit, ECCErrorsDeviceDesc)
} else {
ECCErrorsDeviceStat = &device.StatValue{
Unit: ECCErrorsDeviceUnit,
Desc: ECCErrorsDeviceDesc,
IntNumeratorVal: int64(*statsItem.ECCErrorsDevice),
}
}
return &device.DeviceStats{
Summary: temperatureStat,
Stats: &device.StatObject{
Attributes: map[string]*device.StatValue{
PowerUsageAttr: powerUsageStat,
GPUUtilizationAttr: GPUUtilizationStat,
MemoryUtilizationAttr: memoryUtilizationStat,
EncoderUtilizationAttr: encoderUtilizationStat,
DecoderUtilizationAttr: decoderUtilizationStat,
TemperatureAttr: temperatureStat,
MemoryStateAttr: memoryStateStat,
BAR1StateAttr: BAR1StateStat,
ECCErrorsL1CacheAttr: ECCErrorsL1CacheStat,
ECCErrorsL2CacheAttr: ECCErrorsL2CacheStat,
ECCErrorsDeviceAttr: ECCErrorsDeviceStat,
},
},
Timestamp: timestamp,
}
}
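For illustration, a hypothetical reading of 150 W drawn on a card with a 250 W limit would produce the following power-usage stat (the numbers are made up for the example).

```go
// Hypothetical example value; the numbers are illustrative only.
var examplePowerUsage = &device.StatValue{
	Unit:              PowerUsageUnit, // "W"
	Desc:              PowerUsageDesc,
	IntNumeratorVal:   150, // current power draw
	IntDenominatorVal: 250, // maximum GPU power
}
```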

File diff suppressed because it is too large

vendor/github.com/NVIDIA/gpu-monitoring-tools/CLA (generated, vendored)

@@ -0,0 +1,160 @@
GPU Monitoring Tools
Software Grant and Corporate Contributor License Agreement ("Agreement")
Thank you for your interest in the gpu-monitoring-tools Project (the
"Project"). In order to clarify the intellectual property license
granted with Contributions from any person or entity, NVIDIA
Corporation (the "Copyright Holders") must have a Contributor License
Agreement (CLA) on file that has been signed by each Contributor,
indicating agreement to the license terms below. This license is
for your protection as a Contributor as well as the protection of the
Project and its users; it does not change your rights to use your own
Contributions for any other purpose.
This version of the Agreement allows an entity (the "Corporation") to
submit Contributions to the Project, to authorize Contributions
submitted by its designated employees to the Project, and to grant
copyright and patent licenses thereto to the Copyright Holders.
If you have not already done so, please complete and sign, then scan and
email a pdf file of this Agreement to digits@nvidia.com.
Please read this document carefully before signing and keep a copy for
your records.
Corporation name: ________________________________________________
Corporation address: ________________________________________________
________________________________________________
________________________________________________
Point of Contact: ________________________________________________
E-Mail: ________________________________________________
Telephone: _____________________ Fax: _____________________
You accept and agree to the following terms and conditions for Your
present and future Contributions submitted to the Project. In
return, the Copyright Holders shall not use Your Contributions in a way
that is contrary to the public benefit or inconsistent with its nonprofit
status and bylaws in effect at the time of the Contribution. Except
for the license granted herein to the Copyright Holders and recipients of
software distributed by the Copyright Holders, You reserve all right, title,
and interest in and to Your Contributions.
1. Definitions.
"You" (or "Your") shall mean the copyright owner or legal entity
authorized by the copyright owner that is making this Agreement
with the Copyright Holders. For legal entities, the entity making a
Contribution and all other entities that control, are controlled by,
or are under common control with that entity are considered to be a
single Contributor. For the purposes of this definition, "control"
means (i) the power, direct or indirect, to cause the direction or
management of such entity, whether by contract or otherwise, or
(ii) ownership of fifty percent (50%) or more of the outstanding
shares, or (iii) beneficial ownership of such entity.
"Contribution" shall mean the code, documentation or other original
works of authorship expressly identified in Schedule B, as well as
any original work of authorship, including
any modifications or additions to an existing work, that is intentionally
submitted by You to the Copyright Holders for inclusion in, or
documentation of, any of the products owned or managed by the
Copyright Holders (the "Work"). For the purposes of this definition,
"submitted" means any form of electronic, verbal, or written
communication sent to the Copyright Holders or its representatives,
including but not limited to communication on electronic mailing
lists, source code control systems, and issue tracking systems
that are managed by, or on behalf of, the Copyright Holders for the
purpose of discussing and improving the Work, but excluding
communication that is conspicuously marked or otherwise designated
in writing by You as "Not a Contribution."
2. Grant of Copyright License. Subject to the terms and conditions
of this Agreement, You hereby grant to the Copyright Holders and to
recipients of software distributed by the Copyright Holders a
perpetual, worldwide, non-exclusive, no-charge, royalty-free,
irrevocable copyright license to reproduce, prepare derivative works
of, publicly display, publicly perform, sublicense, and distribute
Your Contributions and such derivative works.
3. Grant of Patent License. Subject to the terms and conditions of
this Agreement, You hereby grant to the Copyright Holders and to
recipients of software distributed by the Copyright Holders
a perpetual, worldwide, non-exclusive, no-charge, royalty-free,
irrevocable (except as stated in this section) patent license
to make, have made, use, offer to sell, sell, import, and otherwise
transfer the Work, where such license applies only to those
patent claims licensable by You that are necessarily infringed
by Your Contribution(s) alone or by combination of Your Contribution(s)
with the Work to which such Contribution(s) were submitted.
If any entity institutes patent litigation against You or any
other entity (including a cross-claim or counterclaim in a lawsuit)
alleging that your Contribution, or the Work to which you have
contributed, constitutes direct or contributory patent infringement,
then any patent licenses granted to that entity under this Agreement
for that Contribution or Work shall terminate as of the date such
litigation is filed.
4. You represent that You are legally entitled to grant the above
license. You represent further that each employee of the
Corporation designated on Schedule A below (or in a subsequent
written modification to that Schedule) is authorized to submit
Contributions on behalf of the Corporation.
5. You represent that each of Your Contributions is Your original
creation (see section 7 for submissions on behalf of others).
6. You are not expected to provide support for Your Contributions,
except to the extent You desire to provide support. You may provide
support for free, for a fee, or not at all. Unless required by
applicable law or agreed to in writing, You provide Your
Contributions on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS
OF ANY KIND, either express or implied, including, without
limitation, any warranties or conditions of TITLE, NON-INFRINGEMENT,
MERCHANTABILITY, or FITNESS FOR A PARTICULAR PURPOSE.
7. Should You wish to submit work that is not Your original creation,
You may submit it to the Copyright Holders separately from any
Contribution, identifying the complete details of its source and
of any license or other restriction (including, but not limited
to, related patents, trademarks, and license agreements) of which
you are personally aware, and conspicuously marking the work as
"Submitted on behalf of a third-party: [named here]".
8. It is your responsibility to notify the Copyright Holders when any change
is required to the list of designated employees authorized to submit
Contributions on behalf of the Corporation, or to the Corporation's
Point of Contact with the Copyright Holders.
Please sign: __________________________________ Date: _______________
Title: __________________________________
Corporation: __________________________________
Schedule A
[Initial list of designated employees. NB: authorization is not
tied to particular Contributions.]
Schedule B
[Identification of optional concurrent software grant. Would be
left blank or omitted if there is no concurrent software grant.]

vendor/github.com/NVIDIA/gpu-monitoring-tools/LICENSE (generated, vendored)

@@ -0,0 +1,29 @@
BSD 3-Clause License
Copyright (c) 2018, NVIDIA Corporation
All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are met:
* Redistributions of source code must retain the above copyright notice, this
list of conditions and the following disclaimer.
* Redistributions in binary form must reproduce the above copyright notice,
this list of conditions and the following disclaimer in the documentation
and/or other materials provided with the distribution.
* Neither the name of the copyright holder nor the names of its
contributors may be used to endorse or promote products derived from
this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.

View File

@@ -0,0 +1,34 @@
# NVIDIA GPU Monitoring Tools
## NVML Go Bindings
[NVIDIA Management Library (NVML)](https://developer.nvidia.com/nvidia-management-library-nvml) is a C-based API for monitoring and managing NVIDIA GPU devices.
The NVML Go bindings are taken from [nvidia-docker 1.0](https://github.com/NVIDIA/nvidia-docker/tree/1.0) with some improvements and additions. The NVML headers are also bundled with the package so that it is easy to use and build.
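
As a quick orientation, the bindings can be driven roughly as shown below: a minimal sketch, assuming a host with the NVIDIA driver (libnvidia-ml.so.1) installed, and using only calls defined in this package (`Init`, `GetDeviceCount`, `NewDevice`, `Status`, `Shutdown`); error handling is deliberately terse.

```
package main

import (
	"fmt"
	"log"

	"github.com/NVIDIA/gpu-monitoring-tools/bindings/go/nvml"
)

func main() {
	// Load libnvidia-ml.so.1 and initialize NVML.
	if err := nvml.Init(); err != nil {
		log.Fatal(err)
	}
	defer nvml.Shutdown()

	count, err := nvml.GetDeviceCount()
	if err != nil {
		log.Fatal(err)
	}
	for i := uint(0); i < count; i++ {
		// Static properties: UUID, device path, model, memory, PCI info.
		dev, err := nvml.NewDevice(i)
		if err != nil {
			log.Fatal(err)
		}
		// Dynamic state: power, temperature, utilization, processes.
		st, err := dev.Status()
		if err != nil {
			log.Fatal(err)
		}
		fmt.Printf("GPU %d: %s (%s)\n", i, dev.UUID, dev.Path)
		if st.Temperature != nil {
			fmt.Printf("  temperature: %d C\n", *st.Temperature)
		}
		if st.Power != nil {
			fmt.Printf("  power draw: %d W\n", *st.Power)
		}
	}
}
```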
### NVML Samples
Three [samples](https://github.com/NVIDIA/gpu-monitoring-tools/blob/master/bindings/go/samples/nvml/README.md) are included to demonstrate how to use the NVML API.
## DCGM Go Bindings
[NVIDIA Data Center GPU Manager (DCGM)](https://developer.nvidia.com/data-center-gpu-manager-dcgm) is a set of tools for managing and monitoring NVIDIA GPUs in cluster environments. It is a low-overhead tool suite that performs a variety of functions on each host system, including active health monitoring, diagnostics, system validation, policies, power and clock management, group configuration, and accounting.
The DCGM Go bindings make it easy to administer and monitor containerized GPU applications.
### DCGM Samples
DCGM can be run in different modes. Seven [samples](https://github.com/NVIDIA/gpu-monitoring-tools/blob/master/bindings/go/samples/dcgm/README.md) and a [REST API](https://github.com/NVIDIA/gpu-monitoring-tools/blob/master/bindings/go/samples/dcgm/restApi/README.md) are included to show how to use the DCGM API and how to run it in each of these modes.
## DCGM exporter
The GPU metrics exporter for [Prometheus](https://prometheus.io/), which leverages [NVIDIA Data Center GPU Manager (DCGM)](https://developer.nvidia.com/data-center-gpu-manager-dcgm), is a simple shell script that starts nv-hostengine, reads GPU metrics every second, and converts them to the standard Prometheus format.
Find the installation and run instructions [here](https://github.com/NVIDIA/gpu-monitoring-tools/blob/master/exporters/prometheus-dcgm/README.md).
## Issues and Contributing
A signed copy of the [Contributor License Agreement](https://github.com/NVIDIA/gpu-monitoring-tools/blob/master/CLA) needs to be provided to [digits@nvidia.com](mailto:digits@nvidia.com) before any change can be accepted.
* Please report problems by [filing a new issue](https://github.com/NVIDIA/gpu-monitoring-tools/issues/new)
* You can contribute by opening a [pull request](https://help.github.com/articles/using-pull-requests/)

View File

@@ -0,0 +1,634 @@
// Copyright (c) 2015-2018, NVIDIA CORPORATION. All rights reserved.
package nvml
// #cgo LDFLAGS: -ldl -Wl,--unresolved-symbols=ignore-in-object-files
// #include "nvml_dl.h"
import "C"
import (
"errors"
"fmt"
"io/ioutil"
"os"
"sort"
"strconv"
"strings"
)
const (
szDriver = C.NVML_SYSTEM_DRIVER_VERSION_BUFFER_SIZE
szName = C.NVML_DEVICE_NAME_BUFFER_SIZE
szUUID = C.NVML_DEVICE_UUID_BUFFER_SIZE
szProcs = 32
szProcName = 64
XidCriticalError = C.nvmlEventTypeXidCriticalError
)
type handle struct{ dev C.nvmlDevice_t }
type EventSet struct{ set C.nvmlEventSet_t }
type Event struct {
UUID *string
Etype uint64
Edata uint64
}
func uintPtr(c C.uint) *uint {
i := uint(c)
return &i
}
func uint64Ptr(c C.ulonglong) *uint64 {
i := uint64(c)
return &i
}
func stringPtr(c *C.char) *string {
s := C.GoString(c)
return &s
}
func errorString(ret C.nvmlReturn_t) error {
if ret == C.NVML_SUCCESS {
return nil
}
err := C.GoString(C.nvmlErrorString(ret))
return fmt.Errorf("nvml: %v", err)
}
func init_() error {
r := C.nvmlInit_dl()
if r == C.NVML_ERROR_LIBRARY_NOT_FOUND {
return errors.New("could not load NVML library")
}
return errorString(r)
}
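// NewEventSet creates an empty NVML event set; devices are added to it with
// RegisterEvent or RegisterEventForDevice.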
func NewEventSet() EventSet {
var set C.nvmlEventSet_t
C.nvmlEventSetCreate(&set)
return EventSet{set}
}
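// RegisterEvent subscribes every detected device to the given NVML event type
// on the provided event set.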
func RegisterEvent(es EventSet, event int) error {
n, err := deviceGetCount()
if err != nil {
return err
}
var i uint
for i = 0; i < n; i++ {
h, err := deviceGetHandleByIndex(i)
if err != nil {
return err
}
r := C.nvmlDeviceRegisterEvents(h.dev, C.ulonglong(event), es.set)
if r != C.NVML_SUCCESS {
return errorString(r)
}
}
return nil
}
func RegisterEventForDevice(es EventSet, event int, uuid string) error {
n, err := deviceGetCount()
if err != nil {
return err
}
var i uint
for i = 0; i < n; i++ {
h, err := deviceGetHandleByIndex(i)
if err != nil {
return err
}
duuid, err := h.deviceGetUUID()
if err != nil {
return err
}
if *duuid != uuid {
continue
}
r := C.nvmlDeviceRegisterEvents(h.dev, C.ulonglong(event), es.set)
if r != C.NVML_SUCCESS {
return errorString(r)
}
return nil
}
return fmt.Errorf("nvml: device not found")
}
func DeleteEventSet(es EventSet) {
C.nvmlEventSetFree(es.set)
}
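// WaitForEvent blocks until an event arrives on the set or the timeout
// expires, returning the event together with the UUID of the originating device.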
func WaitForEvent(es EventSet, timeout uint) (Event, error) {
var data C.nvmlEventData_t
r := C.nvmlEventSetWait(es.set, &data, C.uint(timeout))
uuid, _ := handle{data.device}.deviceGetUUID()
return Event{
UUID: uuid,
Etype: uint64(data.eventType),
Edata: uint64(data.eventData),
},
errorString(r)
}
func shutdown() error {
return errorString(C.nvmlShutdown_dl())
}
func systemGetDriverVersion() (string, error) {
var driver [szDriver]C.char
r := C.nvmlSystemGetDriverVersion(&driver[0], szDriver)
return C.GoString(&driver[0]), errorString(r)
}
func systemGetProcessName(pid uint) (string, error) {
var proc [szProcName]C.char
r := C.nvmlSystemGetProcessName(C.uint(pid), &proc[0], szProcName)
return C.GoString(&proc[0]), errorString(r)
}
func deviceGetCount() (uint, error) {
var n C.uint
r := C.nvmlDeviceGetCount(&n)
return uint(n), errorString(r)
}
func deviceGetHandleByIndex(idx uint) (handle, error) {
var dev C.nvmlDevice_t
r := C.nvmlDeviceGetHandleByIndex(C.uint(idx), &dev)
return handle{dev}, errorString(r)
}
func deviceGetTopologyCommonAncestor(h1, h2 handle) (*uint, error) {
var level C.nvmlGpuTopologyLevel_t
r := C.nvmlDeviceGetTopologyCommonAncestor_dl(h1.dev, h2.dev, &level)
if r == C.NVML_ERROR_FUNCTION_NOT_FOUND || r == C.NVML_ERROR_NOT_SUPPORTED {
return nil, nil
}
return uintPtr(C.uint(level)), errorString(r)
}
func (h handle) deviceGetName() (*string, error) {
var name [szName]C.char
r := C.nvmlDeviceGetName(h.dev, &name[0], szName)
if r == C.NVML_ERROR_NOT_SUPPORTED {
return nil, nil
}
return stringPtr(&name[0]), errorString(r)
}
func (h handle) deviceGetUUID() (*string, error) {
var uuid [szUUID]C.char
r := C.nvmlDeviceGetUUID(h.dev, &uuid[0], szUUID)
if r == C.NVML_ERROR_NOT_SUPPORTED {
return nil, nil
}
return stringPtr(&uuid[0]), errorString(r)
}
func (h handle) deviceGetPciInfo() (*string, error) {
var pci C.nvmlPciInfo_t
r := C.nvmlDeviceGetPciInfo(h.dev, &pci)
if r == C.NVML_ERROR_NOT_SUPPORTED {
return nil, nil
}
return stringPtr(&pci.busId[0]), errorString(r)
}
func (h handle) deviceGetMinorNumber() (*uint, error) {
var minor C.uint
r := C.nvmlDeviceGetMinorNumber(h.dev, &minor)
if r == C.NVML_ERROR_NOT_SUPPORTED {
return nil, nil
}
return uintPtr(minor), errorString(r)
}
func (h handle) deviceGetBAR1MemoryInfo() (*uint64, *uint64, error) {
var bar1 C.nvmlBAR1Memory_t
r := C.nvmlDeviceGetBAR1MemoryInfo(h.dev, &bar1)
if r == C.NVML_ERROR_NOT_SUPPORTED {
return nil, nil, nil
}
return uint64Ptr(bar1.bar1Total), uint64Ptr(bar1.bar1Used), errorString(r)
}
func (h handle) deviceGetPowerManagementLimit() (*uint, error) {
var power C.uint
r := C.nvmlDeviceGetPowerManagementLimit(h.dev, &power)
if r == C.NVML_ERROR_NOT_SUPPORTED {
return nil, nil
}
return uintPtr(power), errorString(r)
}
func (h handle) deviceGetMaxClockInfo() (*uint, *uint, error) {
var sm, mem C.uint
r := C.nvmlDeviceGetMaxClockInfo(h.dev, C.NVML_CLOCK_SM, &sm)
if r == C.NVML_ERROR_NOT_SUPPORTED {
return nil, nil, nil
}
if r == C.NVML_SUCCESS {
r = C.nvmlDeviceGetMaxClockInfo(h.dev, C.NVML_CLOCK_MEM, &mem)
}
return uintPtr(sm), uintPtr(mem), errorString(r)
}
func (h handle) deviceGetMaxPcieLinkGeneration() (*uint, error) {
var link C.uint
r := C.nvmlDeviceGetMaxPcieLinkGeneration(h.dev, &link)
if r == C.NVML_ERROR_NOT_SUPPORTED {
return nil, nil
}
return uintPtr(link), errorString(r)
}
func (h handle) deviceGetMaxPcieLinkWidth() (*uint, error) {
var width C.uint
r := C.nvmlDeviceGetMaxPcieLinkWidth(h.dev, &width)
if r == C.NVML_ERROR_NOT_SUPPORTED {
return nil, nil
}
return uintPtr(width), errorString(r)
}
func (h handle) deviceGetPowerUsage() (*uint, error) {
var power C.uint
r := C.nvmlDeviceGetPowerUsage(h.dev, &power)
if r == C.NVML_ERROR_NOT_SUPPORTED {
return nil, nil
}
return uintPtr(power), errorString(r)
}
func (h handle) deviceGetTemperature() (*uint, error) {
var temp C.uint
r := C.nvmlDeviceGetTemperature(h.dev, C.NVML_TEMPERATURE_GPU, &temp)
if r == C.NVML_ERROR_NOT_SUPPORTED {
return nil, nil
}
return uintPtr(temp), errorString(r)
}
func (h handle) deviceGetUtilizationRates() (*uint, *uint, error) {
var usage C.nvmlUtilization_t
r := C.nvmlDeviceGetUtilizationRates(h.dev, &usage)
if r == C.NVML_ERROR_NOT_SUPPORTED {
return nil, nil, nil
}
return uintPtr(usage.gpu), uintPtr(usage.memory), errorString(r)
}
func (h handle) deviceGetEncoderUtilization() (*uint, error) {
var usage, sampling C.uint
r := C.nvmlDeviceGetEncoderUtilization(h.dev, &usage, &sampling)
if r == C.NVML_ERROR_NOT_SUPPORTED {
return nil, nil
}
return uintPtr(usage), errorString(r)
}
func (h handle) deviceGetDecoderUtilization() (*uint, error) {
var usage, sampling C.uint
r := C.nvmlDeviceGetDecoderUtilization(h.dev, &usage, &sampling)
if r == C.NVML_ERROR_NOT_SUPPORTED {
return nil, nil
}
return uintPtr(usage), errorString(r)
}
func (h handle) deviceGetMemoryInfo() (totalMem *uint64, devMem DeviceMemory, err error) {
var mem C.nvmlMemory_t
r := C.nvmlDeviceGetMemoryInfo(h.dev, &mem)
if r == C.NVML_ERROR_NOT_SUPPORTED {
return
}
err = errorString(r)
if r != C.NVML_SUCCESS {
return
}
totalMem = uint64Ptr(mem.total)
if totalMem != nil {
*totalMem /= 1024 * 1024 // MiB
}
devMem = DeviceMemory{
Used: uint64Ptr(mem.used),
Free: uint64Ptr(mem.free),
}
if devMem.Used != nil {
*devMem.Used /= 1024 * 1024 // MiB
}
if devMem.Free != nil {
*devMem.Free /= 1024 * 1024 // MiB
}
return
}
func (h handle) deviceGetClockInfo() (*uint, *uint, error) {
var sm, mem C.uint
r := C.nvmlDeviceGetClockInfo(h.dev, C.NVML_CLOCK_SM, &sm)
if r == C.NVML_ERROR_NOT_SUPPORTED {
return nil, nil, nil
}
if r == C.NVML_SUCCESS {
r = C.nvmlDeviceGetClockInfo(h.dev, C.NVML_CLOCK_MEM, &mem)
}
return uintPtr(sm), uintPtr(mem), errorString(r)
}
func (h handle) deviceGetMemoryErrorCounter() (*uint64, *uint64, *uint64, error) {
var l1, l2, mem C.ulonglong
r := C.nvmlDeviceGetMemoryErrorCounter(h.dev, C.NVML_MEMORY_ERROR_TYPE_UNCORRECTED,
C.NVML_VOLATILE_ECC, C.NVML_MEMORY_LOCATION_L1_CACHE, &l1)
if r == C.NVML_ERROR_NOT_SUPPORTED {
return nil, nil, nil, nil
}
if r == C.NVML_SUCCESS {
r = C.nvmlDeviceGetMemoryErrorCounter(h.dev, C.NVML_MEMORY_ERROR_TYPE_UNCORRECTED,
C.NVML_VOLATILE_ECC, C.NVML_MEMORY_LOCATION_L2_CACHE, &l2)
}
if r == C.NVML_SUCCESS {
r = C.nvmlDeviceGetMemoryErrorCounter(h.dev, C.NVML_MEMORY_ERROR_TYPE_UNCORRECTED,
C.NVML_VOLATILE_ECC, C.NVML_MEMORY_LOCATION_DEVICE_MEMORY, &mem)
}
return uint64Ptr(l1), uint64Ptr(l2), uint64Ptr(mem), errorString(r)
}
func (h handle) deviceGetPcieThroughput() (*uint, *uint, error) {
var rx, tx C.uint
r := C.nvmlDeviceGetPcieThroughput(h.dev, C.NVML_PCIE_UTIL_RX_BYTES, &rx)
if r == C.NVML_ERROR_NOT_SUPPORTED {
return nil, nil, nil
}
if r == C.NVML_SUCCESS {
r = C.nvmlDeviceGetPcieThroughput(h.dev, C.NVML_PCIE_UTIL_TX_BYTES, &tx)
}
return uintPtr(rx), uintPtr(tx), errorString(r)
}
func (h handle) deviceGetComputeRunningProcesses() ([]uint, []uint64, error) {
var procs [szProcs]C.nvmlProcessInfo_t
var count = C.uint(szProcs)
r := C.nvmlDeviceGetComputeRunningProcesses(h.dev, &count, &procs[0])
if r == C.NVML_ERROR_NOT_SUPPORTED {
return nil, nil, nil
}
n := int(count)
pids := make([]uint, n)
mems := make([]uint64, n)
for i := 0; i < n; i++ {
pids[i] = uint(procs[i].pid)
mems[i] = uint64(procs[i].usedGpuMemory)
}
return pids, mems, errorString(r)
}
func (h handle) deviceGetGraphicsRunningProcesses() ([]uint, []uint64, error) {
var procs [szProcs]C.nvmlProcessInfo_t
var count = C.uint(szProcs)
r := C.nvmlDeviceGetGraphicsRunningProcesses(h.dev, &count, &procs[0])
if r == C.NVML_ERROR_NOT_SUPPORTED {
return nil, nil, nil
}
n := int(count)
pids := make([]uint, n)
mems := make([]uint64, n)
for i := 0; i < n; i++ {
pids[i] = uint(procs[i].pid)
mems[i] = uint64(procs[i].usedGpuMemory)
}
return pids, mems, errorString(r)
}
func (h handle) deviceGetAllRunningProcesses() ([]ProcessInfo, error) {
cPids, cpMems, err := h.deviceGetComputeRunningProcesses()
if err != nil {
return nil, err
}
gPids, gpMems, err := h.deviceGetGraphicsRunningProcesses()
if err != nil {
return nil, err
}
allPids := make(map[uint]ProcessInfo)
for i, pid := range cPids {
name, err := processName(pid)
if err != nil {
return nil, err
}
allPids[pid] = ProcessInfo{
PID: pid,
Name: name,
MemoryUsed: cpMems[i] / (1024 * 1024), // MiB
Type: Compute,
}
}
for i, pid := range gPids {
pInfo, exists := allPids[pid]
if exists {
pInfo.Type = ComputeAndGraphics
allPids[pid] = pInfo
} else {
name, err := processName(pid)
if err != nil {
return nil, err
}
allPids[pid] = ProcessInfo{
PID: pid,
Name: name,
MemoryUsed: gpMems[i] / (1024 * 1024), // MiB
Type: Graphics,
}
}
}
var processInfo []ProcessInfo
for _, v := range allPids {
processInfo = append(processInfo, v)
}
sort.Slice(processInfo, func(i, j int) bool {
return processInfo[i].PID < processInfo[j].PID
})
return processInfo, nil
}
func (h handle) getClocksThrottleReasons() (reason ThrottleReason, err error) {
var clocksThrottleReasons C.ulonglong
r := C.nvmlDeviceGetCurrentClocksThrottleReasons(h.dev, &clocksThrottleReasons)
if r == C.NVML_ERROR_NOT_SUPPORTED {
return ThrottleReasonUnknown, nil
}
if r != C.NVML_SUCCESS {
return ThrottleReasonUnknown, errorString(r)
}
switch clocksThrottleReasons {
case C.nvmlClocksThrottleReasonGpuIdle:
reason = ThrottleReasonGpuIdle
case C.nvmlClocksThrottleReasonApplicationsClocksSetting:
reason = ThrottleReasonApplicationsClocksSetting
case C.nvmlClocksThrottleReasonSwPowerCap:
reason = ThrottleReasonSwPowerCap
case C.nvmlClocksThrottleReasonHwSlowdown:
reason = ThrottleReasonHwSlowdown
case C.nvmlClocksThrottleReasonSyncBoost:
reason = ThrottleReasonSyncBoost
case C.nvmlClocksThrottleReasonSwThermalSlowdown:
reason = ThrottleReasonSwThermalSlowdown
case C.nvmlClocksThrottleReasonHwThermalSlowdown:
reason = ThrottleReasonHwThermalSlowdown
case C.nvmlClocksThrottleReasonHwPowerBrakeSlowdown:
reason = ThrottleReasonHwPowerBrakeSlowdown
case C.nvmlClocksThrottleReasonDisplayClockSetting:
reason = ThrottleReasonDisplayClockSetting
case C.nvmlClocksThrottleReasonNone:
reason = ThrottleReasonNone
}
return
}
func (h handle) getPerformanceState() (PerfState, error) {
var pstate C.nvmlPstates_t
r := C.nvmlDeviceGetPerformanceState(h.dev, &pstate)
if r == C.NVML_ERROR_NOT_SUPPORTED {
return PerfStateUnknown, nil
}
if r != C.NVML_SUCCESS {
return PerfStateUnknown, errorString(r)
}
return PerfState(pstate), nil
}
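// processName resolves a PID to its command name by reading /proc/<pid>/comm;
// a process that has already exited yields an empty name rather than an error.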
func processName(pid uint) (string, error) {
f := `/proc/` + strconv.FormatUint(uint64(pid), 10) + `/comm`
d, err := ioutil.ReadFile(f)
if err != nil {
// TOCTOU: process terminated
if os.IsNotExist(err) {
return "", nil
}
return "", err
}
return strings.TrimSuffix(string(d), "\n"), err
}
func (h handle) getAccountingInfo() (accountingInfo Accounting, err error) {
var mode C.nvmlEnableState_t
var buffer C.uint
r := C.nvmlDeviceGetAccountingMode(h.dev, &mode)
if r == C.NVML_ERROR_NOT_SUPPORTED {
return
}
if r != C.NVML_SUCCESS {
return accountingInfo, errorString(r)
}
r = C.nvmlDeviceGetAccountingBufferSize(h.dev, &buffer)
if r == C.NVML_ERROR_NOT_SUPPORTED {
return
}
if r != C.NVML_SUCCESS {
return accountingInfo, errorString(r)
}
accountingInfo = Accounting{
Mode: ModeState(mode),
BufferSize: uintPtr(buffer),
}
return
}
func (h handle) getDisplayInfo() (display Display, err error) {
var mode, isActive C.nvmlEnableState_t
r := C.nvmlDeviceGetDisplayActive(h.dev, &mode)
if r == C.NVML_ERROR_NOT_SUPPORTED {
return
}
if r != C.NVML_SUCCESS {
return display, errorString(r)
}
r = C.nvmlDeviceGetDisplayMode(h.dev, &isActive)
if r == C.NVML_ERROR_NOT_SUPPORTED {
return
}
if r != C.NVML_SUCCESS {
return display, errorString(r)
}
display = Display{
Mode: ModeState(mode),
Active: ModeState(isActive),
}
return
}
func (h handle) getPeristenceMode() (state ModeState, err error) {
var mode C.nvmlEnableState_t
r := C.nvmlDeviceGetPersistenceMode(h.dev, &mode)
if r == C.NVML_ERROR_NOT_SUPPORTED {
return
}
return ModeState(mode), errorString(r)
}

View File

@@ -0,0 +1,533 @@
// Copyright (c) 2015-2018, NVIDIA CORPORATION. All rights reserved.
package nvml
// #include "nvml_dl.h"
import "C"
import (
"bytes"
"errors"
"fmt"
"io/ioutil"
"strconv"
"strings"
)
var (
ErrCPUAffinity = errors.New("failed to retrieve CPU affinity")
ErrUnsupportedP2PLink = errors.New("unsupported P2P link type")
ErrUnsupportedGPU = errors.New("unsupported GPU device")
)
type ModeState uint
const (
Enabled ModeState = iota
Disabled
)
func (m ModeState) String() string {
switch m {
case Enabled:
return "Enabled"
case Disabled:
return "Disabled"
}
return "N/A"
}
type Display struct {
Mode ModeState
Active ModeState
}
type Accounting struct {
Mode ModeState
BufferSize *uint
}
type DeviceMode struct {
DisplayInfo Display
Persistence ModeState
AccountingInfo Accounting
}
type ThrottleReason uint
const (
ThrottleReasonGpuIdle ThrottleReason = iota
ThrottleReasonApplicationsClocksSetting
ThrottleReasonSwPowerCap
ThrottleReasonHwSlowdown
ThrottleReasonSyncBoost
ThrottleReasonSwThermalSlowdown
ThrottleReasonHwThermalSlowdown
ThrottleReasonHwPowerBrakeSlowdown
ThrottleReasonDisplayClockSetting
ThrottleReasonNone
ThrottleReasonUnknown
)
func (r ThrottleReason) String() string {
switch r {
case ThrottleReasonGpuIdle:
return "Gpu Idle"
case ThrottleReasonApplicationsClocksSetting:
return "Applications Clocks Setting"
case ThrottleReasonSwPowerCap:
return "SW Power Cap"
case ThrottleReasonHwSlowdown:
return "HW Slowdown"
case ThrottleReasonSyncBoost:
return "Sync Boost"
case ThrottleReasonSwThermalSlowdown:
return "SW Thermal Slowdown"
case ThrottleReasonHwThermalSlowdown:
return "HW Thermal Slowdown"
case ThrottleReasonHwPowerBrakeSlowdown:
return "HW Power Brake Slowdown"
case ThrottleReasonDisplayClockSetting:
return "Display Clock Setting"
case ThrottleReasonNone:
return "No clocks throttling"
}
return "N/A"
}
type PerfState uint
const (
PerfStateMax = 0
PerfStateMin = 15
PerfStateUnknown = 32
)
func (p PerfState) String() string {
if p >= PerfStateMax && p <= PerfStateMin {
return fmt.Sprintf("P%d", p)
}
return "Unknown"
}
type ProcessType uint
const (
Compute ProcessType = iota
Graphics
ComputeAndGraphics
)
func (t ProcessType) String() string {
typ := "C+G"
if t == Compute {
typ = "C"
} else if t == Graphics {
typ = "G"
}
return typ
}
type P2PLinkType uint
const (
P2PLinkUnknown P2PLinkType = iota
P2PLinkCrossCPU
P2PLinkSameCPU
P2PLinkHostBridge
P2PLinkMultiSwitch
P2PLinkSingleSwitch
P2PLinkSameBoard
)
type P2PLink struct {
BusID string
Link P2PLinkType
}
func (t P2PLinkType) String() string {
switch t {
case P2PLinkCrossCPU:
return "Cross CPU socket"
case P2PLinkSameCPU:
return "Same CPU socket"
case P2PLinkHostBridge:
return "Host PCI bridge"
case P2PLinkMultiSwitch:
return "Multiple PCI switches"
case P2PLinkSingleSwitch:
return "Single PCI switch"
case P2PLinkSameBoard:
return "Same board"
case P2PLinkUnknown:
}
return "N/A"
}
type ClockInfo struct {
Cores *uint
Memory *uint
}
type PCIInfo struct {
BusID string
BAR1 *uint64
Bandwidth *uint
}
type Device struct {
handle
UUID string
Path string
Model *string
Power *uint
Memory *uint64
CPUAffinity *uint
PCI PCIInfo
Clocks ClockInfo
Topology []P2PLink
}
type UtilizationInfo struct {
GPU *uint
Memory *uint
Encoder *uint
Decoder *uint
}
type PCIThroughputInfo struct {
RX *uint
TX *uint
}
type PCIStatusInfo struct {
BAR1Used *uint64
Throughput PCIThroughputInfo
}
type ECCErrorsInfo struct {
L1Cache *uint64
L2Cache *uint64
Device *uint64
}
type DeviceMemory struct {
Used *uint64
Free *uint64
}
type MemoryInfo struct {
Global DeviceMemory
ECCErrors ECCErrorsInfo
}
type ProcessInfo struct {
PID uint
Name string
MemoryUsed uint64
Type ProcessType
}
type DeviceStatus struct {
Power *uint
Temperature *uint
Utilization UtilizationInfo
Memory MemoryInfo
Clocks ClockInfo
PCI PCIStatusInfo
Processes []ProcessInfo
Throttle ThrottleReason
Performance PerfState
}
func assert(err error) {
if err != nil {
panic(err)
}
}
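// Init loads libnvidia-ml.so.1 and initializes NVML; it must succeed before
// any other call into these bindings.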
func Init() error {
return init_()
}
func Shutdown() error {
return shutdown()
}
func GetDeviceCount() (uint, error) {
return deviceGetCount()
}
func GetDriverVersion() (string, error) {
return systemGetDriverVersion()
}
func numaNode(busid string) (uint, error) {
// discard leading zeros of busid
b, err := ioutil.ReadFile(fmt.Sprintf("/sys/bus/pci/devices/%s/numa_node", strings.ToLower(busid[4:])))
if err != nil {
// XXX report node 0 if NUMA support isn't enabled
return 0, nil
}
node, err := strconv.ParseInt(string(bytes.TrimSpace(b)), 10, 8)
if err != nil {
return 0, fmt.Errorf("%v: %v", ErrCPUAffinity, err)
}
if node < 0 {
node = 0 // XXX report node 0 instead of NUMA_NO_NODE
}
return uint(node), nil
}
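// pciBandwidth computes the PCIe link bandwidth in MB/s as the per-lane rate
// for the link generation multiplied by the link width.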
func pciBandwidth(gen, width *uint) *uint {
m := map[uint]uint{
1: 250, // MB/s
2: 500,
3: 985,
4: 1969,
}
if gen == nil || width == nil {
return nil
}
bw := m[*gen] * *width
return &bw
}
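// NewDevice queries NVML for the device at the given index and fills in its
// static properties (UUID, model, power limit, total memory, PCI, clock and
// NUMA affinity information).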
func NewDevice(idx uint) (device *Device, err error) {
defer func() {
if r := recover(); r != nil {
err = r.(error)
}
}()
h, err := deviceGetHandleByIndex(idx)
assert(err)
model, err := h.deviceGetName()
assert(err)
uuid, err := h.deviceGetUUID()
assert(err)
minor, err := h.deviceGetMinorNumber()
assert(err)
power, err := h.deviceGetPowerManagementLimit()
assert(err)
totalMem, _, err := h.deviceGetMemoryInfo()
assert(err)
busid, err := h.deviceGetPciInfo()
assert(err)
bar1, _, err := h.deviceGetBAR1MemoryInfo()
assert(err)
pcig, err := h.deviceGetMaxPcieLinkGeneration()
assert(err)
pciw, err := h.deviceGetMaxPcieLinkWidth()
assert(err)
ccore, cmem, err := h.deviceGetMaxClockInfo()
assert(err)
if minor == nil || busid == nil || uuid == nil {
return nil, ErrUnsupportedGPU
}
path := fmt.Sprintf("/dev/nvidia%d", *minor)
node, err := numaNode(*busid)
assert(err)
device = &Device{
handle: h,
UUID: *uuid,
Path: path,
Model: model,
Power: power,
Memory: totalMem,
CPUAffinity: &node,
PCI: PCIInfo{
BusID: *busid,
BAR1: bar1,
Bandwidth: pciBandwidth(pcig, pciw), // MB/s
},
Clocks: ClockInfo{
Cores: ccore, // MHz
Memory: cmem, // MHz
},
}
if power != nil {
*device.Power /= 1000 // W
}
if bar1 != nil {
*device.PCI.BAR1 /= 1024 * 1024 // MiB
}
return
}
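// NewDeviceLite is a cheaper variant of NewDevice that only resolves the
// device UUID, /dev path, and PCI bus ID.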
func NewDeviceLite(idx uint) (device *Device, err error) {
defer func() {
if r := recover(); r != nil {
err = r.(error)
}
}()
h, err := deviceGetHandleByIndex(idx)
assert(err)
uuid, err := h.deviceGetUUID()
assert(err)
minor, err := h.deviceGetMinorNumber()
assert(err)
busid, err := h.deviceGetPciInfo()
assert(err)
if minor == nil || busid == nil || uuid == nil {
return nil, ErrUnsupportedGPU
}
path := fmt.Sprintf("/dev/nvidia%d", *minor)
device = &Device{
handle: h,
UUID: *uuid,
Path: path,
PCI: PCIInfo{
BusID: *busid,
},
}
return
}
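// Status queries the device's dynamic state: power draw, temperature,
// utilization, memory usage and ECC errors, clocks, PCIe throughput,
// throttling reason, performance state, and running processes.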
func (d *Device) Status() (status *DeviceStatus, err error) {
defer func() {
if r := recover(); r != nil {
err = r.(error)
}
}()
power, err := d.deviceGetPowerUsage()
assert(err)
temp, err := d.deviceGetTemperature()
assert(err)
ugpu, umem, err := d.deviceGetUtilizationRates()
assert(err)
uenc, err := d.deviceGetEncoderUtilization()
assert(err)
udec, err := d.deviceGetDecoderUtilization()
assert(err)
_, devMem, err := d.deviceGetMemoryInfo()
assert(err)
ccore, cmem, err := d.deviceGetClockInfo()
assert(err)
_, bar1, err := d.deviceGetBAR1MemoryInfo()
assert(err)
el1, el2, emem, err := d.deviceGetMemoryErrorCounter()
assert(err)
pcirx, pcitx, err := d.deviceGetPcieThroughput()
assert(err)
throttle, err := d.getClocksThrottleReasons()
assert(err)
perfState, err := d.getPerformanceState()
assert(err)
processInfo, err := d.deviceGetAllRunningProcesses()
assert(err)
status = &DeviceStatus{
Power: power,
Temperature: temp, // °C
Utilization: UtilizationInfo{
GPU: ugpu, // %
Memory: umem, // %
Encoder: uenc, // %
Decoder: udec, // %
},
Memory: MemoryInfo{
Global: devMem,
ECCErrors: ECCErrorsInfo{
L1Cache: el1,
L2Cache: el2,
Device: emem,
},
},
Clocks: ClockInfo{
Cores: ccore, // MHz
Memory: cmem, // MHz
},
PCI: PCIStatusInfo{
BAR1Used: bar1,
Throughput: PCIThroughputInfo{
RX: pcirx,
TX: pcitx,
},
},
Throttle: throttle,
Performance: perfState,
Processes: processInfo,
}
if power != nil {
*status.Power /= 1000 // W
}
if bar1 != nil {
*status.PCI.BAR1Used /= 1024 * 1024 // MiB
}
if pcirx != nil {
*status.PCI.Throughput.RX /= 1000 // MB/s
}
if pcitx != nil {
*status.PCI.Throughput.TX /= 1000 // MB/s
}
return
}
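// GetP2PLink reports the topology relationship (the common PCI ancestor)
// between two devices.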
func GetP2PLink(dev1, dev2 *Device) (link P2PLinkType, err error) {
level, err := deviceGetTopologyCommonAncestor(dev1.handle, dev2.handle)
if err != nil || level == nil {
return P2PLinkUnknown, err
}
switch *level {
case C.NVML_TOPOLOGY_INTERNAL:
link = P2PLinkSameBoard
case C.NVML_TOPOLOGY_SINGLE:
link = P2PLinkSingleSwitch
case C.NVML_TOPOLOGY_MULTIPLE:
link = P2PLinkMultiSwitch
case C.NVML_TOPOLOGY_HOSTBRIDGE:
link = P2PLinkHostBridge
case C.NVML_TOPOLOGY_CPU:
link = P2PLinkSameCPU
case C.NVML_TOPOLOGY_SYSTEM:
link = P2PLinkCrossCPU
default:
err = ErrUnsupportedP2PLink
}
return
}
func (d *Device) GetComputeRunningProcesses() ([]uint, []uint64, error) {
return d.handle.deviceGetComputeRunningProcesses()
}
func (d *Device) GetGraphicsRunningProcesses() ([]uint, []uint64, error) {
return d.handle.deviceGetGraphicsRunningProcesses()
}
func (d *Device) GetAllRunningProcesses() ([]ProcessInfo, error) {
return d.handle.deviceGetAllRunningProcesses()
}
func (d *Device) GetDeviceMode() (mode *DeviceMode, err error) {
defer func() {
if r := recover(); r != nil {
err = r.(error)
}
}()
display, err := d.getDisplayInfo()
assert(err)
p, err := d.getPeristenceMode()
assert(err)
accounting, err := d.getAccountingInfo()
assert(err)
mode = &DeviceMode{
DisplayInfo: display,
Persistence: p,
AccountingInfo: accounting,
}
return
}

File diff suppressed because it is too large

View File

@@ -0,0 +1,46 @@
// Copyright (c) 2015-2018, NVIDIA CORPORATION. All rights reserved.
#include <stddef.h>
#include <dlfcn.h>
#include "nvml_dl.h"
#define DLSYM(x, sym) \
do { \
dlerror(); \
x = dlsym(handle, #sym); \
if (dlerror() != NULL) { \
return (NVML_ERROR_FUNCTION_NOT_FOUND); \
} \
} while (0)
typedef nvmlReturn_t (*nvmlSym_t)();
static void *handle;
nvmlReturn_t NVML_DL(nvmlInit)(void)
{
handle = dlopen("libnvidia-ml.so.1", RTLD_LAZY | RTLD_GLOBAL);
if (handle == NULL) {
return (NVML_ERROR_LIBRARY_NOT_FOUND);
}
return (nvmlInit());
}
nvmlReturn_t NVML_DL(nvmlShutdown)(void)
{
nvmlReturn_t r = nvmlShutdown();
if (r != NVML_SUCCESS) {
return (r);
}
return (dlclose(handle) ? NVML_ERROR_UNKNOWN : NVML_SUCCESS);
}
nvmlReturn_t NVML_DL(nvmlDeviceGetTopologyCommonAncestor)(
nvmlDevice_t dev1, nvmlDevice_t dev2, nvmlGpuTopologyLevel_t *info)
{
nvmlSym_t sym;
DLSYM(sym, nvmlDeviceGetTopologyCommonAncestor);
return ((*sym)(dev1, dev2, info));
}

View File

@@ -0,0 +1,15 @@
// Copyright (c) 2015-2018, NVIDIA CORPORATION. All rights reserved.
#ifndef _NVML_DL_H_
#define _NVML_DL_H_
#include "nvml.h"
#define NVML_DL(x) x##_dl
extern nvmlReturn_t NVML_DL(nvmlInit)(void);
extern nvmlReturn_t NVML_DL(nvmlShutdown)(void);
extern nvmlReturn_t NVML_DL(nvmlDeviceGetTopologyCommonAncestor)(
nvmlDevice_t, nvmlDevice_t, nvmlGpuTopologyLevel_t *);
#endif // _NVML_DL_H_

2
vendor/vendor.json vendored
View File

@@ -9,6 +9,8 @@
{"path":"github.com/Azure/go-ansiterm/winterm","checksumSHA1":"jBimnggjIiFUjaImNoJhSVLtdzw=","revision":"fa152c58bc15761d0200cb75fe958b89a9d4888e","revisionTime":"2016-06-22T17:32:16Z"},
{"path":"github.com/DataDog/datadog-go/statsd","checksumSHA1":"WvApwvvSe3i/3KO8300dyeFmkbI=","revision":"b10af4b12965a1ad08d164f57d14195b4140d8de","revisionTime":"2017-08-09T10:47:06Z"},
{"path":"github.com/Microsoft/go-winio","checksumSHA1":"AzjRkOQtVBTwIw4RJLTygFhJs3s=","revision":"f533f7a102197536779ea3a8cb881d639e21ec5a","revisionTime":"2017-05-24T00:36:31Z"},
{"path":"github.com/NVIDIA/gpu-monitoring-tools","checksumSHA1":"kF1vk+8Xvb3nGBiw9+qbUc0SZ4M=","revision":"86f2a9fac6c5b597dc494420005144b8ef7ec9fb","revisionTime":"2018-08-29T22:20:09Z"},
{"path":"github.com/NVIDIA/gpu-monitoring-tools/bindings/go/nvml","checksumSHA1":"P8FATSSgpe5A17FyPrGpsX95Xw8=","revision":"86f2a9fac6c5b597dc494420005144b8ef7ec9fb","revisionTime":"2018-08-29T22:20:09Z"},
{"path":"github.com/NYTimes/gziphandler","checksumSHA1":"jktW57+vJsziNVPeXMCoujTzdW4=","revision":"97ae7fbaf81620fe97840685304a78a306a39c64","revisionTime":"2017-09-16T00:36:49Z"},
{"path":"github.com/Nvveen/Gotty","checksumSHA1":"Aqy8/FoAIidY/DeQ5oTYSZ4YFVc=","revision":"cd527374f1e5bff4938207604a14f2e38a9cf512","revisionTime":"2012-06-04T00:48:16Z"},
{"path":"github.com/RackSec/srslog","checksumSHA1":"OTN4c1F0p+mEG2CpkU1Kuavupf0=","revision":"259aed10dfa74ea2961eddd1d9847619f6e98837","revisionTime":"2016-01-20T22:33:50Z"},