diff --git a/helper/funcs.go b/helper/funcs.go
index 49b300c24..4d8f53b1a 100644
--- a/helper/funcs.go
+++ b/helper/funcs.go
@@ -57,11 +57,16 @@ func Int64ToPtr(i int64) *int64 {
return &i
}
-// UintToPtr returns the pointer to an uint
+// Uint64ToPtr returns a pointer to a copy of the given uint64 value
func Uint64ToPtr(u uint64) *uint64 {
return &u
}
+// UintToPtr copies the given uint into freshly allocated storage and returns
+// the address of that copy
+func UintToPtr(u uint) *uint {
+	ptr := new(uint)
+	*ptr = u
+	return ptr
+}
+
// StringToPtr returns the pointer to a string
func StringToPtr(str string) *string {
return &str
diff --git a/plugins/device/cmd/nvidia/README.md b/plugins/device/cmd/nvidia/README.md
new file mode 100644
index 000000000..55d1e7f13
--- /dev/null
+++ b/plugins/device/cmd/nvidia/README.md
@@ -0,0 +1,23 @@
+This package provides an implementation of the Nvidia device plugin.
+
+# Behavior
+
+The Nvidia device plugin uses NVML bindings to collect data about the available Nvidia devices and exposes them via the Fingerprint RPC. GPUs can be excluded from fingerprinting by listing their UUIDs in the `ignored_gpu_ids` field. The plugin emits statistics for the fingerprinted devices once every `stats_period`.
+
+# Config
+
+The configuration should be passed via an HCL file that begins with a top level `config` stanza:
+
+```
+config {
+ ignored_gpu_ids = ["uuid1", "uuid2"]
+ fingerprint_period = "5s"
+ stats_period = "5s"
+}
+```
+
+The valid configuration options are:
+
+* `ignored_gpu_ids` (`list(string)`: `[]`): List of GPU UUID strings that should not be exposed to Nomad.
+* `fingerprint_period` (`string`: `"5s"`): The interval at which to repeat the fingerprint process to detect device changes.
+* `stats_period` (`string`: `"5s"`): The interval at which to emit statistics about the devices.
diff --git a/plugins/device/cmd/nvidia/cmd/main.go b/plugins/device/cmd/nvidia/cmd/main.go
new file mode 100644
index 000000000..1b5b0c41c
--- /dev/null
+++ b/plugins/device/cmd/nvidia/cmd/main.go
@@ -0,0 +1,18 @@
+package main
+
+import (
+ log "github.com/hashicorp/go-hclog"
+
+ "github.com/hashicorp/nomad/plugins"
+ "github.com/hashicorp/nomad/plugins/device/cmd/nvidia"
+)
+
+// main is the entry point of the Nvidia device plugin binary.
+func main() {
+ // Serve the plugin: hand the factory function to plugins.Serve
+ plugins.Serve(factory)
+}
+
+// factory builds the Nvidia GPU plugin instance that is handed to
+// plugins.Serve; the supplied logger is forwarded to the plugin constructor.
+func factory(log log.Logger) interface{} {
+	plugin := nvidia.NewNvidiaDevice(log)
+	return plugin
+}
diff --git a/plugins/device/cmd/nvidia/device.go b/plugins/device/cmd/nvidia/device.go
new file mode 100644
index 000000000..2613d8e77
--- /dev/null
+++ b/plugins/device/cmd/nvidia/device.go
@@ -0,0 +1,209 @@
+package nvidia
+
+import (
+ "context"
+ "fmt"
+ "strings"
+ "sync"
+ "time"
+
+ log "github.com/hashicorp/go-hclog"
+
+ "github.com/hashicorp/nomad/plugins/base"
+ "github.com/hashicorp/nomad/plugins/device"
+ "github.com/hashicorp/nomad/plugins/device/cmd/nvidia/nvml"
+ "github.com/hashicorp/nomad/plugins/shared/hclspec"
+)
+
+const (
+ // pluginName is the name of the plugin; it is reported in pluginInfo and
+ // used to name the plugin's logger
+ pluginName = "nvidia-gpu"
+
+ // vendor is the vendor providing the devices; it is set on every device
+ // group built from fingerprint data
+ vendor = "nvidia"
+
+ // deviceType is the type of device being returned
+ deviceType = device.DeviceTypeGPU
+
+ // notAvailable value is returned to nomad server in case some properties were
+ // undetected by nvml driver. It is also used as the group name for devices
+ // whose device name could not be detected
+ notAvailable = "N/A"
+)
+
+const (
+ // Nvidia-container-runtime environment variable names
+ //
+ // nvidiaVisibleDevices is the environment variable that Reserve fills with
+ // the comma-separated IDs of the reserved devices
+ nvidiaVisibleDevices = "NVIDIA_VISIBLE_DEVICES"
+)
+
+var (
+ // pluginInfo describes the plugin; it is returned as-is by PluginInfo
+ pluginInfo = &base.PluginInfoResponse{
+ Type: base.PluginTypeDevice,
+ PluginApiVersion: "0.0.1", // XXX This should be an array and should be consts
+ PluginVersion: "0.1.0",
+ Name: pluginName,
+ }
+
+ // configSpec is the specification of the plugin's configuration; it is
+ // returned as-is by ConfigSchema. All three attributes are optional
+ configSpec = hclspec.NewObject(map[string]*hclspec.Spec{
+ // ignored_gpu_ids: list of strings, defaults to an empty list
+ "ignored_gpu_ids": hclspec.NewDefault(
+ hclspec.NewAttr("ignored_gpu_ids", "list(string)", false),
+ hclspec.NewLiteral("[]"),
+ ),
+ // fingerprint_period: duration string, defaults to "5s"
+ "fingerprint_period": hclspec.NewDefault(
+ hclspec.NewAttr("fingerprint_period", "string", false),
+ hclspec.NewLiteral("\"5s\""),
+ ),
+ // stats_period: duration string, defaults to "5s"
+ "stats_period": hclspec.NewDefault(
+ hclspec.NewAttr("stats_period", "string", false),
+ hclspec.NewLiteral("\"5s\""),
+ ),
+ })
+)
+
+// Config contains configuration information for the plugin. It is the decode
+// target used by SetConfig.
+type Config struct {
+ // IgnoredGPUIDs lists the GPU UUIDs that SetConfig adds to the ignored set
+ IgnoredGPUIDs []string `codec:"ignored_gpu_ids"`
+ // FingerprintPeriod is a duration string parsed with time.ParseDuration
+ FingerprintPeriod string `codec:"fingerprint_period"`
+ // StatsPeriod is a duration string parsed with time.ParseDuration
+ StatsPeriod string `codec:"stats_period"`
+}
+
+// NvidiaDevice contains all plugin specific data
+type NvidiaDevice struct {
+ // nvmlClient is used to get data from nvidia
+ nvmlClient nvml.NvmlClient
+
+ // nvmlClientInitializationError holds an error retrieved during
+ // nvmlClient initialization. When it is non-nil the fingerprint goroutine
+ // reports an empty fingerprint and exits
+ nvmlClientInitializationError error
+
+ // ignoredGPUIDs is a set of UUIDs that would not be exposed to nomad
+ ignoredGPUIDs map[string]struct{}
+
+ // fingerprintPeriod is how often we should call nvml to get list of devices
+ fingerprintPeriod time.Duration
+
+ // statsPeriod is how often we should collect statistics for fingerprinted
+ // devices.
+ statsPeriod time.Duration
+
+ // devices is the set of detected eligible devices, keyed by device UUID.
+ // It is replaced on every fingerprint run and consulted by Reserve
+ devices map[string]struct{}
+ // deviceLock guards devices: Reserve takes the read lock, the fingerprint
+ // change detection takes the write lock
+ deviceLock sync.RWMutex
+
+ // logger is the plugin logger, named after pluginName by NewNvidiaDevice
+ logger log.Logger
+}
+
+// NewNvidiaDevice builds a new nvidia device plugin. A failure to initialize
+// the NVML client is logged and stored on the returned device instead of being
+// returned to the caller.
+func NewNvidiaDevice(log log.Logger) *NvidiaDevice {
+	client, initErr := nvml.NewNvmlClient()
+
+	namedLogger := log.Named(pluginName)
+	if initErr != nil {
+		namedLogger.Error("unable to initialize Nvidia driver", "error", initErr)
+	}
+
+	plugin := &NvidiaDevice{
+		nvmlClient:                    client,
+		nvmlClientInitializationError: initErr,
+		ignoredGPUIDs:                 make(map[string]struct{}),
+		devices:                       make(map[string]struct{}),
+		logger:                        namedLogger,
+	}
+	return plugin
+}
+
+// PluginInfo returns information describing the plugin. It always returns the
+// package-level pluginInfo value and a nil error.
+func (d *NvidiaDevice) PluginInfo() (*base.PluginInfoResponse, error) {
+ return pluginInfo, nil
+}
+
+// ConfigSchema returns the plugins configuration schema. It always returns the
+// package-level configSpec value and a nil error.
+func (d *NvidiaDevice) ConfigSchema() (*hclspec.Spec, error) {
+ return configSpec, nil
+}
+
+// SetConfig is used to set the configuration of the plugin.
+//
+// data is the MsgPack-encoded configuration; it is decoded into a Config.
+// Both period strings are parsed before any field of d is modified, so a
+// configuration that is rejected leaves the previous settings untouched.
+// The set of ignored GPU UUIDs is rebuilt from scratch on every call, which
+// also keeps the method safe on a NvidiaDevice whose ignoredGPUIDs map was
+// never allocated.
+//
+// An error is returned when data cannot be decoded or when either period is
+// not accepted by time.ParseDuration.
+func (d *NvidiaDevice) SetConfig(data []byte) error {
+	var config Config
+	if err := base.MsgPackDecode(data, &config); err != nil {
+		return err
+	}
+
+	// Validate everything first; nothing on d is touched until all values
+	// are known to be good.
+	fingerprintPeriod, err := time.ParseDuration(config.FingerprintPeriod)
+	if err != nil {
+		return fmt.Errorf("failed to parse fingerprint period %q: %v", config.FingerprintPeriod, err)
+	}
+
+	statsPeriod, err := time.ParseDuration(config.StatsPeriod)
+	if err != nil {
+		return fmt.Errorf("failed to parse stats period %q: %v", config.StatsPeriod, err)
+	}
+
+	ignoredGPUIDs := make(map[string]struct{}, len(config.IgnoredGPUIDs))
+	for _, ignoredGPUID := range config.IgnoredGPUIDs {
+		ignoredGPUIDs[ignoredGPUID] = struct{}{}
+	}
+
+	d.ignoredGPUIDs = ignoredGPUIDs
+	d.fingerprintPeriod = fingerprintPeriod
+	d.statsPeriod = statsPeriod
+
+	return nil
+}
+
+// Fingerprint starts the fingerprinting goroutine and returns the channel on
+// which it streams detected devices. Messages are emitted when device changes
+// are detected or the devices health changes. The returned error is always nil.
+func (d *NvidiaDevice) Fingerprint(ctx context.Context) (<-chan *device.FingerprintResponse, error) {
+	responses := make(chan *device.FingerprintResponse)
+
+	go d.fingerprint(ctx, responses)
+
+	return responses, nil
+}
+
+// reservationError is returned by Reserve when some of the requested device
+// IDs are not present in the set of fingerprinted devices.
+type reservationError struct {
+	// notExistingIDs holds the requested IDs that were not found
+	notExistingIDs []string
+}
+
+// Error implements the error interface; the unknown IDs are listed
+// comma-separated after a fixed prefix.
+func (e *reservationError) Error() string {
+	missing := strings.Join(e.notExistingIDs, ",")
+	return fmt.Sprintf("unknown device IDs: %s", missing)
+}
+
+// Reserve returns information on how to mount given devices.
+// Assumption is made that nomad server is responsible for correctness of
+// GPU allocations, handling tricky cases such as double-allocation of single GPU
+//
+// An empty request yields an empty reservation. If any requested ID is not in
+// the current set of fingerprinted devices, a *reservationError listing those
+// IDs is returned and the reservation is nil.
+func (d *NvidiaDevice) Reserve(deviceIDs []string) (*device.ContainerReservation, error) {
+	reservation := &device.ContainerReservation{}
+	if len(deviceIDs) == 0 {
+		return reservation, nil
+	}
+
+	// The plugin is asynchronous, so a reservation can race with a
+	// fingerprint run:
+	//
+	//   1 - fingerprint reports that GPU with id "1" is present
+	//   2 - at the same time:
+	//       a) server decides to allocate GPU with id "1"
+	//       b) fingerprint check reports that GPU with id "1" is gone
+	//
+	// d.devices always holds the latest fingerprinted ids, so every requested
+	// id is checked against it and the request is rejected if any is missing.
+	var unknownIDs []string
+	d.deviceLock.RLock()
+	for _, requested := range deviceIDs {
+		_, known := d.devices[requested]
+		if known {
+			continue
+		}
+		unknownIDs = append(unknownIDs, requested)
+	}
+	d.deviceLock.RUnlock()
+
+	if len(unknownIDs) > 0 {
+		return nil, &reservationError{unknownIDs}
+	}
+
+	reservation.Envs = map[string]string{
+		nvidiaVisibleDevices: strings.Join(deviceIDs, ","),
+	}
+	return reservation, nil
+}
+
+// Stats starts the statistics goroutine and returns the channel on which it
+// streams statistics for the detected devices. The returned error is always nil.
+func (d *NvidiaDevice) Stats(ctx context.Context) (<-chan *device.StatsResponse, error) {
+	responses := make(chan *device.StatsResponse)
+
+	go d.stats(ctx, responses)
+
+	return responses, nil
+}
diff --git a/plugins/device/cmd/nvidia/device_test.go b/plugins/device/cmd/nvidia/device_test.go
new file mode 100644
index 000000000..b1fa4b17a
--- /dev/null
+++ b/plugins/device/cmd/nvidia/device_test.go
@@ -0,0 +1,115 @@
+package nvidia
+
+import (
+ "testing"
+
+ "github.com/hashicorp/nomad/plugins/device/cmd/nvidia/nvml"
+
+ hclog "github.com/hashicorp/go-hclog"
+ "github.com/hashicorp/nomad/plugins/device"
+ "github.com/stretchr/testify/require"
+)
+
+// MockNvmlClient is a test double for the NVML client: each method returns
+// the canned response and error stored in the corresponding fields.
+type MockNvmlClient struct {
+ // FingerprintError is the error returned by GetFingerprintData
+ FingerprintError error
+ // FingerprintResponseReturned is the data returned by GetFingerprintData
+ FingerprintResponseReturned *nvml.FingerprintData
+
+ // StatsError is the error returned by GetStatsData
+ StatsError error
+ // StatsResponseReturned is the data returned by GetStatsData
+ StatsResponseReturned []*nvml.StatsData
+}
+
+// GetFingerprintData returns the configured fingerprint response and error.
+func (c *MockNvmlClient) GetFingerprintData() (*nvml.FingerprintData, error) {
+ return c.FingerprintResponseReturned, c.FingerprintError
+}
+
+// GetStatsData returns the configured stats response and error.
+func (c *MockNvmlClient) GetStatsData() ([]*nvml.StatsData, error) {
+ return c.StatsResponseReturned, c.StatsError
+}
+
+// TestReserve checks Reserve against devices with different sets of
+// fingerprinted IDs. Every table entry runs as its own subtest, so a failure
+// is reported under the case name and does not hide the remaining cases.
+func TestReserve(t *testing.T) {
+	// deviceWith builds a NvidiaDevice whose fingerprinted set holds ids.
+	// With no ids the devices map is left nil.
+	deviceWith := func(ids ...string) *NvidiaDevice {
+		dev := &NvidiaDevice{logger: hclog.NewNullLogger()}
+		if len(ids) == 0 {
+			return dev
+		}
+		dev.devices = make(map[string]struct{}, len(ids))
+		for _, id := range ids {
+			dev.devices[id] = struct{}{}
+		}
+		return dev
+	}
+
+	for _, testCase := range []struct {
+		Name                string
+		ExpectedReservation *device.ContainerReservation
+		ExpectedError       error
+		Device              *NvidiaDevice
+		RequestedIDs        []string
+	}{
+		{
+			Name:                "All RequestedIDs are not managed by Device",
+			ExpectedReservation: nil,
+			ExpectedError:       &reservationError{[]string{"UUID1", "UUID2", "UUID3"}},
+			RequestedIDs:        []string{"UUID1", "UUID2", "UUID3"},
+			Device:              deviceWith(),
+		},
+		{
+			Name:                "Some RequestedIDs are not managed by Device",
+			ExpectedReservation: nil,
+			ExpectedError:       &reservationError{[]string{"UUID1", "UUID2"}},
+			RequestedIDs:        []string{"UUID1", "UUID2", "UUID3"},
+			Device:              deviceWith("UUID3"),
+		},
+		{
+			Name: "All RequestedIDs are managed by Device",
+			ExpectedReservation: &device.ContainerReservation{
+				Envs: map[string]string{
+					nvidiaVisibleDevices: "UUID1,UUID2,UUID3",
+				},
+			},
+			ExpectedError: nil,
+			RequestedIDs:  []string{"UUID1", "UUID2", "UUID3"},
+			Device:        deviceWith("UUID1", "UUID2", "UUID3"),
+		},
+		{
+			Name:                "No IDs requested",
+			ExpectedReservation: &device.ContainerReservation{},
+			ExpectedError:       nil,
+			RequestedIDs:        nil,
+			Device:              deviceWith("UUID1", "UUID2", "UUID3"),
+		},
+	} {
+		testCase := testCase
+		t.Run(testCase.Name, func(t *testing.T) {
+			req := require.New(t)
+			actualReservation, actualError := testCase.Device.Reserve(testCase.RequestedIDs)
+			req.Equal(testCase.ExpectedReservation, actualReservation)
+			req.Equal(testCase.ExpectedError, actualError)
+		})
+	}
+}
diff --git a/plugins/device/cmd/nvidia/fingerprint.go b/plugins/device/cmd/nvidia/fingerprint.go
new file mode 100644
index 000000000..171b01844
--- /dev/null
+++ b/plugins/device/cmd/nvidia/fingerprint.go
@@ -0,0 +1,235 @@
+package nvidia
+
+import (
+ "context"
+ "fmt"
+ "time"
+
+ "github.com/hashicorp/nomad/plugins/device"
+ "github.com/hashicorp/nomad/plugins/device/cmd/nvidia/nvml"
+)
+
+const (
+ // Attribute names for reporting Fingerprint output. These are the keys of
+ // the attribute map attached to each device group; DriverVersionAttr is
+ // added as an attribute common to all groups, the rest are derived from
+ // the fingerprint data of a device in the group
+ MemoryMiBAttr = "memory_mib"
+ PowerWAttr = "power_w"
+ BAR1MiBAttr = "bar1_mib"
+ DriverVersionAttr = "driver_version"
+ CoresClockMHzAttr = "cores_clock_mhz"
+ MemoryClockMHzAttr = "memory_clock_mhz"
+ PCIBandwidthMBPerSAttr = "pci_bandwidth_mb/s"
+ DisplayStateAttr = "display_state"
+ PersistenceModeAttr = "persistence_mode"
+)
+
+// fingerprint is the long running goroutine that detects hardware.
+//
+// It closes devices when it returns. If the NVML client failed to initialize,
+// a single empty fingerprint is sent and the goroutine exits. Otherwise a
+// fingerprint pass runs immediately and then once every d.fingerprintPeriod
+// until ctx is cancelled.
+func (d *NvidiaDevice) fingerprint(ctx context.Context, devices chan<- *device.FingerprintResponse) {
+	defer close(devices)
+
+	if d.nvmlClientInitializationError != nil {
+		d.logger.Error("exiting fingerprinting due to problems with NVML loading", "error", d.nvmlClientInitializationError)
+		// write empty fingerprint response to let server know that there are
+		// no working Nvidia GPU units. The channel is unbuffered, so also
+		// watch ctx: without it the goroutine would block forever if the
+		// receiver has already gone away.
+		select {
+		case devices <- device.NewFingerprint():
+		case <-ctx.Done():
+		}
+		return
+	}
+
+	// Create a timer that will fire immediately for the first detection
+	timer := time.NewTimer(0)
+	// Release the timer when the goroutine exits
+	defer timer.Stop()
+
+	for {
+		select {
+		case <-ctx.Done():
+			return
+		case <-timer.C:
+			// The timer has fired and its channel was drained just above,
+			// so it can be re-armed directly
+			timer.Reset(d.fingerprintPeriod)
+		}
+		d.writeFingerprintToChannel(devices)
+	}
+}
+
+// writeFingerprintToChannel makes nvml call and writes response to channel
+func (d *NvidiaDevice) writeFingerprintToChannel(devices chan<- *device.FingerprintResponse) {
+ fingerprintData, err := d.nvmlClient.GetFingerprintData()
+
+ if err != nil {
+ d.logger.Error("failed to get fingerprint nvidia devices", "error", err)
+ devices <- device.NewFingerprintError(err)
+ return
+ }
+
+ // ignore devices from fingerprint output
+ fingerprintDevices := ignoreFingerprintedDevices(fingerprintData.Devices, d.ignoredGPUIDs)
+ // check if any device health was updated or any device was added to host
+ if !d.fingerprintChanged(fingerprintDevices) {
+ return
+ }
+
+ commonAttributes := map[string]string{
+ DriverVersionAttr: fingerprintData.DriverVersion,
+ }
+
+ // Group all FingerprintDevices by DeviceName attribute
+ deviceListByDeviceName := make(map[string][]*nvml.FingerprintDeviceData)
+ for _, device := range fingerprintDevices {
+ deviceName := device.DeviceName
+ if deviceName == nil {
+ // nvml driver was not able to detect device name. This kind
+ // of devices are placed to single group with 'notAvailable' name
+ notAvailableCopy := notAvailable
+ deviceName = ¬AvailableCopy
+ }
+
+ deviceListByDeviceName[*deviceName] = append(deviceListByDeviceName[*deviceName], device)
+ }
+
+ // Build Fingerprint response with computed groups and send it over the channel
+ deviceGroups := make([]*device.DeviceGroup, 0, len(deviceListByDeviceName))
+ for groupName, devices := range deviceListByDeviceName {
+ deviceGroups = append(deviceGroups, deviceGroupFromFingerprintData(groupName, devices, commonAttributes))
+ }
+ devices <- device.NewFingerprint(deviceGroups...)
+}
+
+// ignoreFingerprintedDevices returns the entries of deviceData whose UUID is
+// not a key of ignoredGPUIDs, in their original order. The result is nil when
+// nothing is kept.
+func ignoreFingerprintedDevices(deviceData []*nvml.FingerprintDeviceData, ignoredGPUIDs map[string]struct{}) []*nvml.FingerprintDeviceData {
+	var kept []*nvml.FingerprintDeviceData
+	for i := range deviceData {
+		candidate := deviceData[i]
+		if _, skip := ignoredGPUIDs[candidate.UUID]; skip {
+			continue
+		}
+		kept = append(kept, candidate)
+	}
+	return kept
+}
+
+// fingerprintChanged reports whether the UUIDs in allDevices differ from the
+// set recorded by the previous fingerprint run, i.e. whether a previously
+// unseen device appeared or a known device disappeared. As a side effect it
+// replaces the device map on NvidiaDevice with the set built from allDevices.
+// The whole operation runs under the device write lock.
+func (d *NvidiaDevice) fingerprintChanged(allDevices []*nvml.FingerprintDeviceData) bool {
+	d.deviceLock.Lock()
+	defer d.deviceLock.Unlock()
+
+	// Set of UUIDs seen in this run
+	current := make(map[string]struct{})
+	for _, fingerprinted := range allDevices {
+		current[fingerprinted.UUID] = struct{}{}
+	}
+
+	changed := false
+
+	// Anything new compared to the previous run?
+	for uuid := range current {
+		if _, seenBefore := d.devices[uuid]; !seenBefore {
+			changed = true
+			break
+		}
+	}
+
+	// Anything gone compared to the previous run?
+	if !changed {
+		for uuid := range d.devices {
+			if _, stillPresent := current[uuid]; !stillPresent {
+				changed = true
+				break
+			}
+		}
+	}
+
+	d.devices = current
+	return changed
+}
+
+// deviceGroupFromFingerprintData builds the device group named groupName from
+// deviceList. The group attributes are taken from the first device in the list
+// and then extended (overwriting on key clash) with commonAttributes.
+// It returns nil when deviceList is empty.
+func deviceGroupFromFingerprintData(groupName string, deviceList []*nvml.FingerprintDeviceData, commonAttributes map[string]string) *device.DeviceGroup {
+	// a group without devices makes no sense
+	if len(deviceList) == 0 {
+		return nil
+	}
+
+	// Assumption made that devices with the same DeviceName have the same
+	// attributes like amount of memory, power, bar1memory etc
+	attributes := attributesFromFingerprintDeviceData(deviceList[0])
+	for key, value := range commonAttributes {
+		attributes[key] = value
+	}
+
+	groupDevices := make([]*device.Device, 0, len(deviceList))
+	for _, fingerprinted := range deviceList {
+		groupDevices = append(groupDevices, &device.Device{
+			ID: fingerprinted.UUID,
+			// all fingerprinted devices are "healthy" for now
+			// to get real health data -> dcgm bindings should be used
+			Healthy: true,
+			HwLocality: &device.DeviceLocality{
+				PciBusID: fingerprinted.PCIBusID,
+			},
+		})
+	}
+
+	return &device.DeviceGroup{
+		Vendor:     vendor,
+		Type:       deviceType,
+		Name:       groupName,
+		Devices:    groupDevices,
+		Attributes: attributes,
+	}
+}
+
+// attributesFromFingerprintDeviceData converts nvml.FingerprintDeviceData
+// struct to device.DeviceGroup.Attributes format (map[string]string)
+// this function performs all nil checks for FingerprintDeviceData pointers
+func attributesFromFingerprintDeviceData(fingerprintDeviceData *nvml.FingerprintDeviceData) map[string]string {
+ // The following fields in FingerprintDeviceData are pointers, so they can be nil
+ // In case they are nil -> return 'notAvailable' constant instead
+ var (
+ MemoryMiB string
+ PowerW string
+ BAR1MiB string
+ CoresClockMHz string
+ MemoryClockMHz string
+ PCIBandwidthMBPerS string
+ )
+
+ if fingerprintDeviceData.MemoryMiB == nil {
+ MemoryMiB = notAvailable
+ } else {
+ MemoryMiB = fmt.Sprint(*fingerprintDeviceData.MemoryMiB)
+ }
+
+ if fingerprintDeviceData.PowerW == nil {
+ PowerW = notAvailable
+ } else {
+ PowerW = fmt.Sprint(*fingerprintDeviceData.PowerW)
+ }
+
+ if fingerprintDeviceData.BAR1MiB == nil {
+ BAR1MiB = notAvailable
+ } else {
+ BAR1MiB = fmt.Sprint(*fingerprintDeviceData.BAR1MiB)
+ }
+
+ if fingerprintDeviceData.CoresClockMHz == nil {
+ CoresClockMHz = notAvailable
+ } else {
+ CoresClockMHz = fmt.Sprint(*fingerprintDeviceData.CoresClockMHz)
+ }
+
+ if fingerprintDeviceData.MemoryClockMHz == nil {
+ MemoryClockMHz = notAvailable
+ } else {
+ MemoryClockMHz = fmt.Sprint(*fingerprintDeviceData.MemoryClockMHz)
+ }
+
+ if fingerprintDeviceData.PCIBandwidthMBPerS == nil {
+ PCIBandwidthMBPerS = notAvailable
+ } else {
+ PCIBandwidthMBPerS = fmt.Sprint(*fingerprintDeviceData.PCIBandwidthMBPerS)
+ }
+
+ return map[string]string{
+ DisplayStateAttr: fingerprintDeviceData.DisplayState,
+ PersistenceModeAttr: fingerprintDeviceData.PersistenceMode,
+ MemoryMiBAttr: MemoryMiB,
+ PowerWAttr: PowerW,
+ BAR1MiBAttr: BAR1MiB,
+ CoresClockMHzAttr: CoresClockMHz,
+ MemoryClockMHzAttr: MemoryClockMHz,
+ PCIBandwidthMBPerSAttr: PCIBandwidthMBPerS,
+ }
+
+}
diff --git a/plugins/device/cmd/nvidia/fingerprint_test.go b/plugins/device/cmd/nvidia/fingerprint_test.go
new file mode 100644
index 000000000..b181ebd65
--- /dev/null
+++ b/plugins/device/cmd/nvidia/fingerprint_test.go
@@ -0,0 +1,1243 @@
+package nvidia
+
+import (
+ "context"
+ "errors"
+ "sort"
+ "testing"
+
+ hclog "github.com/hashicorp/go-hclog"
+ "github.com/hashicorp/nomad/helper"
+ "github.com/hashicorp/nomad/plugins/device"
+ "github.com/hashicorp/nomad/plugins/device/cmd/nvidia/nvml"
+ "github.com/stretchr/testify/require"
+)
+
+// TestIgnoreFingerprintedDevices checks that ignoreFingerprintedDevices drops
+// exactly the devices whose UUID is in the ignored set and keeps the order of
+// the remaining ones. Every table entry runs as its own named subtest.
+func TestIgnoreFingerprintedDevices(t *testing.T) {
+	// newDevice builds the fixture "DeviceName<suffix>" / "UUID<suffix>".
+	newDevice := func(suffix string) *nvml.FingerprintDeviceData {
+		return &nvml.FingerprintDeviceData{
+			DeviceData: &nvml.DeviceData{
+				DeviceName: helper.StringToPtr("DeviceName" + suffix),
+				UUID:       "UUID" + suffix,
+				MemoryMiB:  helper.Uint64ToPtr(1000),
+			},
+		}
+	}
+	// newDevices builds one fixture per suffix, in the given order.
+	newDevices := func(suffixes ...string) []*nvml.FingerprintDeviceData {
+		list := make([]*nvml.FingerprintDeviceData, 0, len(suffixes))
+		for _, suffix := range suffixes {
+			list = append(list, newDevice(suffix))
+		}
+		return list
+	}
+
+	for _, testCase := range []struct {
+		Name           string
+		DeviceData     []*nvml.FingerprintDeviceData
+		IgnoredGPUIds  map[string]struct{}
+		ExpectedResult []*nvml.FingerprintDeviceData
+	}{
+		{
+			Name:       "Odd ignored",
+			DeviceData: newDevices("1", "2", "3"),
+			IgnoredGPUIds: map[string]struct{}{
+				"UUID2": {},
+			},
+			ExpectedResult: newDevices("1", "3"),
+		},
+		{
+			Name:       "Even ignored",
+			DeviceData: newDevices("1", "2", "3"),
+			IgnoredGPUIds: map[string]struct{}{
+				"UUID1": {},
+				"UUID3": {},
+			},
+			ExpectedResult: newDevices("2"),
+		},
+		{
+			Name:       "All ignored",
+			DeviceData: newDevices("1", "2", "3"),
+			IgnoredGPUIds: map[string]struct{}{
+				"UUID1": {},
+				"UUID2": {},
+				"UUID3": {},
+			},
+			// nothing kept -> nil slice, not an empty one
+			ExpectedResult: nil,
+		},
+		{
+			Name:           "No ignored",
+			DeviceData:     newDevices("1", "2", "3"),
+			IgnoredGPUIds:  map[string]struct{}{},
+			ExpectedResult: newDevices("1", "2", "3"),
+		},
+		{
+			Name:       "No DeviceData provided",
+			DeviceData: nil,
+			IgnoredGPUIds: map[string]struct{}{
+				"UUID1": {},
+				"UUID2": {},
+				"UUID3": {},
+			},
+			ExpectedResult: nil,
+		},
+	} {
+		testCase := testCase
+		t.Run(testCase.Name, func(t *testing.T) {
+			actualResult := ignoreFingerprintedDevices(testCase.DeviceData, testCase.IgnoredGPUIds)
+			require.New(t).Equal(testCase.ExpectedResult, actualResult)
+		})
+	}
+}
+
+// TestCheckFingerprintUpdates checks both results of fingerprintChanged: the
+// returned "changed" flag and the device map stored on the NvidiaDevice
+// afterwards. Every table entry runs as its own named subtest.
+func TestCheckFingerprintUpdates(t *testing.T) {
+	// uuidSet builds a non-nil set holding the given UUIDs.
+	uuidSet := func(uuids ...string) map[string]struct{} {
+		set := make(map[string]struct{}, len(uuids))
+		for _, uuid := range uuids {
+			set[uuid] = struct{}{}
+		}
+		return set
+	}
+	// fingerprinted builds one fixture per UUID, in the given order.
+	fingerprinted := func(uuids ...string) []*nvml.FingerprintDeviceData {
+		list := make([]*nvml.FingerprintDeviceData, 0, len(uuids))
+		for _, uuid := range uuids {
+			list = append(list, &nvml.FingerprintDeviceData{
+				DeviceData: &nvml.DeviceData{
+					UUID: uuid,
+				},
+			})
+		}
+		return list
+	}
+
+	for _, testCase := range []struct {
+		Name                     string
+		Device                   *NvidiaDevice
+		AllDevices               []*nvml.FingerprintDeviceData
+		DeviceMapAfterMethodCall map[string]struct{}
+		ExpectedResult           bool
+	}{
+		{
+			Name:                     "No updates",
+			Device:                   &NvidiaDevice{devices: uuidSet("1", "2", "3")},
+			AllDevices:               fingerprinted("1", "2", "3"),
+			ExpectedResult:           false,
+			DeviceMapAfterMethodCall: uuidSet("1", "2", "3"),
+		},
+		{
+			Name:                     "New Device Appeared",
+			Device:                   &NvidiaDevice{devices: uuidSet("1", "2", "3")},
+			AllDevices:               fingerprinted("1", "2", "3", "I am new"),
+			ExpectedResult:           true,
+			DeviceMapAfterMethodCall: uuidSet("1", "2", "3", "I am new"),
+		},
+		{
+			Name:                     "Device disappeared",
+			Device:                   &NvidiaDevice{devices: uuidSet("1", "2", "3")},
+			AllDevices:               fingerprinted("1", "2"),
+			ExpectedResult:           true,
+			DeviceMapAfterMethodCall: uuidSet("1", "2"),
+		},
+		{
+			Name:                     "No devices in NvidiaDevice map",
+			Device:                   &NvidiaDevice{},
+			AllDevices:               fingerprinted("1", "2", "3"),
+			ExpectedResult:           true,
+			DeviceMapAfterMethodCall: uuidSet("1", "2", "3"),
+		},
+		{
+			Name:                     "No devices detected",
+			Device:                   &NvidiaDevice{devices: uuidSet("1", "2", "3")},
+			AllDevices:               nil,
+			ExpectedResult:           true,
+			DeviceMapAfterMethodCall: uuidSet(),
+		},
+	} {
+		testCase := testCase
+		t.Run(testCase.Name, func(t *testing.T) {
+			req := require.New(t)
+			actualResult := testCase.Device.fingerprintChanged(testCase.AllDevices)
+			// check that function returns valid "updated / not updated" state
+			req.Equal(testCase.ExpectedResult, actualResult)
+			// check that function properly updates devices map
+			req.Equal(testCase.DeviceMapAfterMethodCall, testCase.Device.devices)
+		})
+	}
+}
+
+// TestAttributesFromFingerprintDeviceData checks the attribute map produced
+// for a fully populated device and for devices with exactly one optional
+// field set to nil, which must be reported as notAvailable. Every table entry
+// runs as its own named subtest on a fresh copy of the fixture.
+func TestAttributesFromFingerprintDeviceData(t *testing.T) {
+	// populatedData returns a fresh fixture with every optional field set.
+	populatedData := func() *nvml.FingerprintDeviceData {
+		return &nvml.FingerprintDeviceData{
+			DeviceData: &nvml.DeviceData{
+				UUID:       "1",
+				DeviceName: helper.StringToPtr("Type1"),
+				MemoryMiB:  helper.Uint64ToPtr(256),
+				PowerW:     helper.UintToPtr(2),
+				BAR1MiB:    helper.Uint64ToPtr(256),
+			},
+			PCIBusID:           "pciBusID1",
+			PCIBandwidthMBPerS: helper.UintToPtr(1),
+			CoresClockMHz:      helper.UintToPtr(1),
+			MemoryClockMHz:     helper.UintToPtr(1),
+			DisplayState:       "Enabled",
+			PersistenceMode:    "Enabled",
+		}
+	}
+	// populatedAttributes returns the attributes expected for populatedData.
+	populatedAttributes := func() map[string]string {
+		return map[string]string{
+			MemoryMiBAttr:          "256",
+			PowerWAttr:             "2",
+			BAR1MiBAttr:            "256",
+			PCIBandwidthMBPerSAttr: "1",
+			CoresClockMHzAttr:      "1",
+			MemoryClockMHzAttr:     "1",
+			DisplayStateAttr:       "Enabled",
+			PersistenceModeAttr:    "Enabled",
+		}
+	}
+
+	for _, testCase := range []struct {
+		Name string
+		// Unset clears one optional field of the fixture; nil leaves the
+		// fixture fully populated.
+		Unset func(data *nvml.FingerprintDeviceData)
+		// MissingAttr is the attribute expected to be notAvailable once
+		// Unset has been applied.
+		MissingAttr string
+	}{
+		{
+			Name: "All attributes are not nil",
+		},
+		{
+			Name:        "MemoryMiB is nil and has to be replaced to N/A",
+			Unset:       func(data *nvml.FingerprintDeviceData) { data.DeviceData.MemoryMiB = nil },
+			MissingAttr: MemoryMiBAttr,
+		},
+		{
+			Name:        "PowerW is nil and has to be replaced to N/A",
+			Unset:       func(data *nvml.FingerprintDeviceData) { data.DeviceData.PowerW = nil },
+			MissingAttr: PowerWAttr,
+		},
+		{
+			Name:        "BAR1MiB is nil and has to be replaced to N/A",
+			Unset:       func(data *nvml.FingerprintDeviceData) { data.DeviceData.BAR1MiB = nil },
+			MissingAttr: BAR1MiBAttr,
+		},
+		{
+			Name:        "PCIBandwidthMBPerS is nil and has to be replaced to N/A",
+			Unset:       func(data *nvml.FingerprintDeviceData) { data.PCIBandwidthMBPerS = nil },
+			MissingAttr: PCIBandwidthMBPerSAttr,
+		},
+		{
+			Name:        "CoresClockMHz is nil and has to be replaced to N/A",
+			Unset:       func(data *nvml.FingerprintDeviceData) { data.CoresClockMHz = nil },
+			MissingAttr: CoresClockMHzAttr,
+		},
+		{
+			Name:        "MemoryClockMHz is nil and has to be replaced to N/A",
+			Unset:       func(data *nvml.FingerprintDeviceData) { data.MemoryClockMHz = nil },
+			MissingAttr: MemoryClockMHzAttr,
+		},
+	} {
+		testCase := testCase
+		t.Run(testCase.Name, func(t *testing.T) {
+			fingerprintDeviceData := populatedData()
+			expectedResult := populatedAttributes()
+			if testCase.Unset != nil {
+				testCase.Unset(fingerprintDeviceData)
+				expectedResult[testCase.MissingAttr] = notAvailable
+			}
+
+			actualResult := attributesFromFingerprintDeviceData(fingerprintDeviceData)
+			require.New(t).Equal(expectedResult, actualResult)
+		})
+	}
+}
+
+func TestDeviceGroupFromFingerprintData(t *testing.T) {
+ for _, testCase := range []struct {
+ Name string // case label; passed to the assertion so a failure names its case
+ GroupName string // group name handed to deviceGroupFromFingerprintData
+ Devices []*nvml.FingerprintDeviceData // devices to be placed in the group
+ CommonAttributes map[string]string // attributes expected to be merged into the group's attributes
+ ExpectedResult *device.DeviceGroup // nil when no devices are provided
+ }{
+ {
+ Name: "Devices are provided",
+ GroupName: "Type1",
+ Devices: []*nvml.FingerprintDeviceData{
+ {
+ DeviceData: &nvml.DeviceData{
+ UUID: "1",
+ DeviceName: helper.StringToPtr("Type1"),
+ MemoryMiB: helper.Uint64ToPtr(100),
+ PowerW: helper.UintToPtr(2),
+ BAR1MiB: helper.Uint64ToPtr(256),
+ },
+ PCIBusID: "pciBusID1",
+ PCIBandwidthMBPerS: helper.UintToPtr(1),
+ CoresClockMHz: helper.UintToPtr(1),
+ MemoryClockMHz: helper.UintToPtr(1),
+ DisplayState: "Enabled",
+ PersistenceMode: "Enabled",
+ },
+ {
+ DeviceData: &nvml.DeviceData{
+ UUID: "2",
+ DeviceName: helper.StringToPtr("Type1"),
+ MemoryMiB: helper.Uint64ToPtr(100),
+ PowerW: helper.UintToPtr(2),
+ BAR1MiB: helper.Uint64ToPtr(256),
+ },
+ PCIBusID: "pciBusID2",
+ PCIBandwidthMBPerS: helper.UintToPtr(1),
+ CoresClockMHz: helper.UintToPtr(1),
+ MemoryClockMHz: helper.UintToPtr(1),
+ DisplayState: "Enabled",
+ PersistenceMode: "Enabled",
+ },
+ },
+ ExpectedResult: &device.DeviceGroup{
+ Vendor: vendor,
+ Type: deviceType,
+ Name: "Type1",
+ Devices: []*device.Device{
+ {
+ ID: "1",
+ Healthy: true,
+ HwLocality: &device.DeviceLocality{
+ PciBusID: "pciBusID1",
+ },
+ },
+ {
+ ID: "2",
+ Healthy: true,
+ HwLocality: &device.DeviceLocality{
+ PciBusID: "pciBusID2",
+ },
+ },
+ },
+ Attributes: map[string]string{
+ MemoryMiBAttr: "100",
+ PowerWAttr: "2",
+ BAR1MiBAttr: "256",
+ PCIBandwidthMBPerSAttr: "1",
+ CoresClockMHzAttr: "1",
+ MemoryClockMHzAttr: "1",
+ DisplayStateAttr: "Enabled",
+ PersistenceModeAttr: "Enabled",
+ },
+ },
+ },
+ {
+ Name: "Devices and common attributes are provided",
+ GroupName: "Type1",
+ Devices: []*nvml.FingerprintDeviceData{
+ {
+ DeviceData: &nvml.DeviceData{
+ UUID: "1",
+ DeviceName: helper.StringToPtr("Type1"),
+ MemoryMiB: helper.Uint64ToPtr(100),
+ PowerW: helper.UintToPtr(2),
+ BAR1MiB: helper.Uint64ToPtr(256),
+ },
+ PCIBusID: "pciBusID1",
+ PCIBandwidthMBPerS: helper.UintToPtr(1),
+ CoresClockMHz: helper.UintToPtr(1),
+ MemoryClockMHz: helper.UintToPtr(1),
+ DisplayState: "Enabled",
+ PersistenceMode: "Enabled",
+ },
+ {
+ DeviceData: &nvml.DeviceData{
+ UUID: "2",
+ DeviceName: helper.StringToPtr("Type1"),
+ MemoryMiB: helper.Uint64ToPtr(100),
+ PowerW: helper.UintToPtr(2),
+ BAR1MiB: helper.Uint64ToPtr(256),
+ },
+ PCIBusID: "pciBusID2",
+ PCIBandwidthMBPerS: helper.UintToPtr(1),
+ CoresClockMHz: helper.UintToPtr(1),
+ MemoryClockMHz: helper.UintToPtr(1),
+ DisplayState: "Enabled",
+ PersistenceMode: "Enabled",
+ },
+ },
+ CommonAttributes: map[string]string{
+ DriverVersionAttr: "1",
+ },
+ ExpectedResult: &device.DeviceGroup{
+ Vendor: vendor,
+ Type: deviceType,
+ Name: "Type1",
+ Devices: []*device.Device{
+ {
+ ID: "1",
+ Healthy: true,
+ HwLocality: &device.DeviceLocality{
+ PciBusID: "pciBusID1",
+ },
+ },
+ {
+ ID: "2",
+ Healthy: true,
+ HwLocality: &device.DeviceLocality{
+ PciBusID: "pciBusID2",
+ },
+ },
+ },
+ Attributes: map[string]string{
+ MemoryMiBAttr: "100",
+ PowerWAttr: "2",
+ BAR1MiBAttr: "256",
+ DriverVersionAttr: "1",
+ PCIBandwidthMBPerSAttr: "1",
+ CoresClockMHzAttr: "1",
+ MemoryClockMHzAttr: "1",
+ DisplayStateAttr: "Enabled",
+ PersistenceModeAttr: "Enabled",
+ },
+ },
+ },
+ {
+ Name: "Devices are not provided",
+ GroupName: "Type1",
+ CommonAttributes: map[string]string{
+ DriverVersionAttr: "1",
+ },
+ Devices: nil,
+ ExpectedResult: nil,
+ },
+ } {
+ actualResult := deviceGroupFromFingerprintData(testCase.GroupName, testCase.Devices, testCase.CommonAttributes)
+ require.New(t).Equal(testCase.ExpectedResult, actualResult, testCase.Name)
+ }
+}
+
+func TestWriteFingerprintToChannel(t *testing.T) {
+ for _, testCase := range []struct {
+ Name string // case label; passed to the assertion so a failure names its case
+ Device *NvidiaDevice // device under test, backed by a MockNvmlClient
+ ExpectedWriteToChannel *device.FingerprintResponse // response expected on the channel
+ }{
+ {
+ Name: "Check that FingerprintError is handled properly",
+ Device: &NvidiaDevice{
+ nvmlClient: &MockNvmlClient{
+ FingerprintError: errors.New(""),
+ },
+ logger: hclog.NewNullLogger(),
+ },
+ ExpectedWriteToChannel: &device.FingerprintResponse{
+ Error: errors.New(""),
+ },
+ },
+ {
+ Name: "Check ignore devices works correctly",
+ Device: &NvidiaDevice{
+ nvmlClient: &MockNvmlClient{
+ FingerprintResponseReturned: &nvml.FingerprintData{
+ DriverVersion: "1",
+ Devices: []*nvml.FingerprintDeviceData{
+ {
+ DeviceData: &nvml.DeviceData{
+ UUID: "1",
+ DeviceName: helper.StringToPtr("Name"),
+ MemoryMiB: helper.Uint64ToPtr(10),
+ PowerW: helper.UintToPtr(100),
+ BAR1MiB: helper.Uint64ToPtr(256),
+ },
+ PCIBusID: "pciBusID1",
+ PCIBandwidthMBPerS: helper.UintToPtr(1),
+ CoresClockMHz: helper.UintToPtr(1),
+ MemoryClockMHz: helper.UintToPtr(1),
+ DisplayState: "Enabled",
+ PersistenceMode: "Enabled",
+ },
+ {
+ DeviceData: &nvml.DeviceData{
+ UUID: "2",
+ DeviceName: helper.StringToPtr("Name"),
+ MemoryMiB: helper.Uint64ToPtr(10),
+ PowerW: helper.UintToPtr(100),
+ BAR1MiB: helper.Uint64ToPtr(256),
+ },
+ PCIBusID: "pciBusID2",
+ PCIBandwidthMBPerS: helper.UintToPtr(1),
+ CoresClockMHz: helper.UintToPtr(1),
+ MemoryClockMHz: helper.UintToPtr(1),
+ DisplayState: "Enabled",
+ PersistenceMode: "Enabled",
+ },
+ },
+ },
+ },
+ ignoredGPUIDs: map[string]struct{}{
+ "1": {},
+ },
+ logger: hclog.NewNullLogger(),
+ },
+ ExpectedWriteToChannel: &device.FingerprintResponse{
+ Devices: []*device.DeviceGroup{
+ {
+ Vendor: vendor,
+ Type: deviceType,
+ Name: "Name",
+ Devices: []*device.Device{
+ {
+ ID: "2",
+ Healthy: true,
+ HwLocality: &device.DeviceLocality{
+ PciBusID: "pciBusID2",
+ },
+ },
+ },
+ Attributes: map[string]string{
+ MemoryMiBAttr: "10",
+ PowerWAttr: "100",
+ BAR1MiBAttr: "256",
+ DriverVersionAttr: "1",
+ PCIBandwidthMBPerSAttr: "1",
+ CoresClockMHzAttr: "1",
+ MemoryClockMHzAttr: "1",
+ DisplayStateAttr: "Enabled",
+ PersistenceModeAttr: "Enabled",
+ },
+ },
+ },
+ },
+ },
+ {
+ Name: "Check devices are split to multiple device groups 1",
+ Device: &NvidiaDevice{
+ nvmlClient: &MockNvmlClient{
+ FingerprintResponseReturned: &nvml.FingerprintData{
+ DriverVersion: "1",
+ Devices: []*nvml.FingerprintDeviceData{
+ {
+ DeviceData: &nvml.DeviceData{
+ UUID: "1",
+ DeviceName: helper.StringToPtr("Name1"),
+ MemoryMiB: helper.Uint64ToPtr(10),
+ PowerW: helper.UintToPtr(100),
+ BAR1MiB: helper.Uint64ToPtr(256),
+ },
+ PCIBusID: "pciBusID1",
+ PCIBandwidthMBPerS: helper.UintToPtr(1),
+ CoresClockMHz: helper.UintToPtr(1),
+ MemoryClockMHz: helper.UintToPtr(1),
+ DisplayState: "Enabled",
+ PersistenceMode: "Enabled",
+ },
+ {
+ DeviceData: &nvml.DeviceData{
+ UUID: "2",
+ DeviceName: helper.StringToPtr("Name2"),
+ MemoryMiB: helper.Uint64ToPtr(11),
+ PowerW: helper.UintToPtr(100),
+ BAR1MiB: helper.Uint64ToPtr(256),
+ },
+ PCIBusID: "pciBusID2",
+ PCIBandwidthMBPerS: helper.UintToPtr(1),
+ CoresClockMHz: helper.UintToPtr(1),
+ MemoryClockMHz: helper.UintToPtr(1),
+ DisplayState: "Enabled",
+ PersistenceMode: "Enabled",
+ },
+ {
+ DeviceData: &nvml.DeviceData{
+ UUID: "3",
+ DeviceName: helper.StringToPtr("Name3"),
+ MemoryMiB: helper.Uint64ToPtr(12),
+ PowerW: helper.UintToPtr(100),
+ BAR1MiB: helper.Uint64ToPtr(256),
+ },
+ PCIBusID: "pciBusID3",
+ PCIBandwidthMBPerS: helper.UintToPtr(1),
+ CoresClockMHz: helper.UintToPtr(1),
+ MemoryClockMHz: helper.UintToPtr(1),
+ DisplayState: "Enabled",
+ PersistenceMode: "Enabled",
+ },
+ },
+ },
+ },
+ logger: hclog.NewNullLogger(),
+ },
+ ExpectedWriteToChannel: &device.FingerprintResponse{
+ Devices: []*device.DeviceGroup{
+ {
+ Vendor: vendor,
+ Type: deviceType,
+ Name: "Name1",
+ Devices: []*device.Device{
+ {
+ ID: "1",
+ Healthy: true,
+ HwLocality: &device.DeviceLocality{
+ PciBusID: "pciBusID1",
+ },
+ },
+ },
+ Attributes: map[string]string{
+ MemoryMiBAttr: "10",
+ DriverVersionAttr: "1",
+ PowerWAttr: "100",
+ BAR1MiBAttr: "256",
+ PCIBandwidthMBPerSAttr: "1",
+ CoresClockMHzAttr: "1",
+ MemoryClockMHzAttr: "1",
+ DisplayStateAttr: "Enabled",
+ PersistenceModeAttr: "Enabled",
+ },
+ },
+ {
+ Vendor: vendor,
+ Type: deviceType,
+ Name: "Name2",
+ Devices: []*device.Device{
+ {
+ ID: "2",
+ Healthy: true,
+ HwLocality: &device.DeviceLocality{
+ PciBusID: "pciBusID2",
+ },
+ },
+ },
+ Attributes: map[string]string{
+ MemoryMiBAttr: "11",
+ DriverVersionAttr: "1",
+ PowerWAttr: "100",
+ BAR1MiBAttr: "256",
+ PCIBandwidthMBPerSAttr: "1",
+ CoresClockMHzAttr: "1",
+ MemoryClockMHzAttr: "1",
+ DisplayStateAttr: "Enabled",
+ PersistenceModeAttr: "Enabled",
+ },
+ },
+ {
+ Vendor: vendor,
+ Type: deviceType,
+ Name: "Name3",
+ Devices: []*device.Device{
+ {
+ ID: "3",
+ Healthy: true,
+ HwLocality: &device.DeviceLocality{
+ PciBusID: "pciBusID3",
+ },
+ },
+ },
+ Attributes: map[string]string{
+ MemoryMiBAttr: "12",
+ DriverVersionAttr: "1",
+ PowerWAttr: "100",
+ BAR1MiBAttr: "256",
+ PCIBandwidthMBPerSAttr: "1",
+ CoresClockMHzAttr: "1",
+ MemoryClockMHzAttr: "1",
+ DisplayStateAttr: "Enabled",
+ PersistenceModeAttr: "Enabled",
+ },
+ },
+ },
+ },
+ },
+ {
+ Name: "Check devices are split to multiple device groups 2",
+ Device: &NvidiaDevice{
+ nvmlClient: &MockNvmlClient{
+ FingerprintResponseReturned: &nvml.FingerprintData{
+ DriverVersion: "1",
+ Devices: []*nvml.FingerprintDeviceData{
+ {
+ DeviceData: &nvml.DeviceData{
+ UUID: "1",
+ DeviceName: helper.StringToPtr("Name1"),
+ MemoryMiB: helper.Uint64ToPtr(10),
+ PowerW: helper.UintToPtr(100),
+ BAR1MiB: helper.Uint64ToPtr(256),
+ },
+ PCIBusID: "pciBusID1",
+ PCIBandwidthMBPerS: helper.UintToPtr(1),
+ CoresClockMHz: helper.UintToPtr(1),
+ MemoryClockMHz: helper.UintToPtr(1),
+ DisplayState: "Enabled",
+ PersistenceMode: "Enabled",
+ },
+ {
+ DeviceData: &nvml.DeviceData{
+ UUID: "2",
+ DeviceName: helper.StringToPtr("Name2"),
+ MemoryMiB: helper.Uint64ToPtr(11),
+ PowerW: helper.UintToPtr(100),
+ BAR1MiB: helper.Uint64ToPtr(256),
+ },
+ PCIBusID: "pciBusID2",
+ PCIBandwidthMBPerS: helper.UintToPtr(1),
+ CoresClockMHz: helper.UintToPtr(1),
+ MemoryClockMHz: helper.UintToPtr(1),
+ DisplayState: "Enabled",
+ PersistenceMode: "Enabled",
+ },
+ {
+ DeviceData: &nvml.DeviceData{
+ UUID: "3",
+ DeviceName: helper.StringToPtr("Name2"),
+ MemoryMiB: helper.Uint64ToPtr(12),
+ PowerW: helper.UintToPtr(100),
+ BAR1MiB: helper.Uint64ToPtr(256),
+ },
+ PCIBusID: "pciBusID3",
+ PCIBandwidthMBPerS: helper.UintToPtr(1),
+ CoresClockMHz: helper.UintToPtr(1),
+ MemoryClockMHz: helper.UintToPtr(1),
+ DisplayState: "Enabled",
+ PersistenceMode: "Enabled",
+ },
+ },
+ },
+ },
+ logger: hclog.NewNullLogger(),
+ },
+ ExpectedWriteToChannel: &device.FingerprintResponse{
+ Devices: []*device.DeviceGroup{
+ {
+ Vendor: vendor,
+ Type: deviceType,
+ Name: "Name1",
+ Devices: []*device.Device{
+ {
+ ID: "1",
+ Healthy: true,
+ HwLocality: &device.DeviceLocality{
+ PciBusID: "pciBusID1",
+ },
+ },
+ },
+ Attributes: map[string]string{
+ MemoryMiBAttr: "10",
+ DriverVersionAttr: "1",
+ PowerWAttr: "100",
+ BAR1MiBAttr: "256",
+ PCIBandwidthMBPerSAttr: "1",
+ CoresClockMHzAttr: "1",
+ MemoryClockMHzAttr: "1",
+ DisplayStateAttr: "Enabled",
+ PersistenceModeAttr: "Enabled",
+ },
+ },
+ {
+ Vendor: vendor,
+ Type: deviceType,
+ Name: "Name2",
+ Devices: []*device.Device{
+ {
+ ID: "2",
+ Healthy: true,
+ HwLocality: &device.DeviceLocality{
+ PciBusID: "pciBusID2",
+ },
+ },
+ {
+ ID: "3",
+ Healthy: true,
+ HwLocality: &device.DeviceLocality{
+ PciBusID: "pciBusID3",
+ },
+ },
+ },
+ Attributes: map[string]string{
+ MemoryMiBAttr: "11",
+ DriverVersionAttr: "1",
+ PowerWAttr: "100",
+ BAR1MiBAttr: "256",
+ PCIBandwidthMBPerSAttr: "1",
+ CoresClockMHzAttr: "1",
+ MemoryClockMHzAttr: "1",
+ DisplayStateAttr: "Enabled",
+ PersistenceModeAttr: "Enabled",
+ },
+ },
+ },
+ },
+ },
+ } {
+ channel := make(chan *device.FingerprintResponse, 1)
+ testCase.Device.writeFingerprintToChannel(channel)
+ actualResult := <-channel
+ // writeFingerprintToChannel iterates over map keys and inserts the
+ // results into a slice, so the order of device groups in the output
+ // may differ between runs; sort both the actual and the expected
+ // groups by name before comparing them.
+ sort.Slice(actualResult.Devices, func(i, j int) bool {
+ return actualResult.Devices[i].Name < actualResult.Devices[j].Name
+ })
+ sort.Slice(testCase.ExpectedWriteToChannel.Devices, func(i, j int) bool {
+ return testCase.ExpectedWriteToChannel.Devices[i].Name < testCase.ExpectedWriteToChannel.Devices[j].Name
+ })
+ require.New(t).Equal(testCase.ExpectedWriteToChannel, actualResult, testCase.Name)
+ }
+}
+
+// TestFingerprint checks that a working driver yields fingerprint data and that a driver which failed to initialize yields an empty response
+func TestFingerprint(t *testing.T) {
+ for _, testCase := range []struct {
+ Name string // case label; passed to the assertion so a failure names its case
+ Device *NvidiaDevice // device under test; nvmlClientInitializationError selects the scenario
+ ExpectedWriteToChannel *device.FingerprintResponse // first response expected on the channel
+ }{
+ {
+ Name: "Check that working driver returns valid fingeprint data",
+ Device: &NvidiaDevice{
+ nvmlClientInitializationError: nil,
+ nvmlClient: &MockNvmlClient{
+ FingerprintResponseReturned: &nvml.FingerprintData{
+ DriverVersion: "1",
+ Devices: []*nvml.FingerprintDeviceData{
+ {
+ DeviceData: &nvml.DeviceData{
+ UUID: "1",
+ DeviceName: helper.StringToPtr("Name1"),
+ MemoryMiB: helper.Uint64ToPtr(10),
+ PowerW: helper.UintToPtr(100),
+ BAR1MiB: helper.Uint64ToPtr(256),
+ },
+ PCIBusID: "pciBusID1",
+ PCIBandwidthMBPerS: helper.UintToPtr(1),
+ CoresClockMHz: helper.UintToPtr(1),
+ MemoryClockMHz: helper.UintToPtr(1),
+ DisplayState: "Enabled",
+ PersistenceMode: "Enabled",
+ },
+ {
+ DeviceData: &nvml.DeviceData{
+ UUID: "2",
+ DeviceName: helper.StringToPtr("Name1"),
+ MemoryMiB: helper.Uint64ToPtr(10),
+ PowerW: helper.UintToPtr(100),
+ BAR1MiB: helper.Uint64ToPtr(256),
+ },
+ PCIBusID: "pciBusID2",
+ PCIBandwidthMBPerS: helper.UintToPtr(1),
+ CoresClockMHz: helper.UintToPtr(1),
+ MemoryClockMHz: helper.UintToPtr(1),
+ DisplayState: "Enabled",
+ PersistenceMode: "Enabled",
+ },
+ {
+ DeviceData: &nvml.DeviceData{
+ UUID: "3",
+ DeviceName: helper.StringToPtr("Name1"),
+ MemoryMiB: helper.Uint64ToPtr(10),
+ PowerW: helper.UintToPtr(100),
+ BAR1MiB: helper.Uint64ToPtr(256),
+ },
+ PCIBusID: "pciBusID3",
+ PCIBandwidthMBPerS: helper.UintToPtr(1),
+ CoresClockMHz: helper.UintToPtr(1),
+ MemoryClockMHz: helper.UintToPtr(1),
+ DisplayState: "Enabled",
+ PersistenceMode: "Enabled",
+ },
+ },
+ },
+ },
+ logger: hclog.NewNullLogger(),
+ },
+ ExpectedWriteToChannel: &device.FingerprintResponse{
+ Devices: []*device.DeviceGroup{
+ {
+ Vendor: vendor,
+ Type: deviceType,
+ Name: "Name1",
+ Devices: []*device.Device{
+ {
+ ID: "1",
+ Healthy: true,
+ HwLocality: &device.DeviceLocality{
+ PciBusID: "pciBusID1",
+ },
+ },
+ {
+ ID: "2",
+ Healthy: true,
+ HwLocality: &device.DeviceLocality{
+ PciBusID: "pciBusID2",
+ },
+ },
+ {
+ ID: "3",
+ Healthy: true,
+ HwLocality: &device.DeviceLocality{
+ PciBusID: "pciBusID3",
+ },
+ },
+ },
+ Attributes: map[string]string{
+ MemoryMiBAttr: "10",
+ DriverVersionAttr: "1",
+ PowerWAttr: "100",
+ BAR1MiBAttr: "256",
+ PCIBandwidthMBPerSAttr: "1",
+ CoresClockMHzAttr: "1",
+ MemoryClockMHzAttr: "1",
+ DisplayStateAttr: "Enabled",
+ PersistenceModeAttr: "Enabled",
+ },
+ },
+ },
+ },
+ },
+ {
+ Name: "Check that not working driver returns empty fingeprint data",
+ Device: &NvidiaDevice{
+ nvmlClientInitializationError: errors.New(""),
+ nvmlClient: &MockNvmlClient{
+ FingerprintResponseReturned: &nvml.FingerprintData{
+ DriverVersion: "1",
+ Devices: []*nvml.FingerprintDeviceData{
+ {
+ DeviceData: &nvml.DeviceData{
+ UUID: "1",
+ DeviceName: helper.StringToPtr("Name1"),
+ MemoryMiB: helper.Uint64ToPtr(10),
+ },
+ },
+ {
+ DeviceData: &nvml.DeviceData{
+ UUID: "2",
+ DeviceName: helper.StringToPtr("Name1"),
+ MemoryMiB: helper.Uint64ToPtr(10),
+ },
+ },
+ {
+ DeviceData: &nvml.DeviceData{
+ UUID: "3",
+ DeviceName: helper.StringToPtr("Name1"),
+ MemoryMiB: helper.Uint64ToPtr(10),
+ },
+ },
+ },
+ },
+ },
+ logger: hclog.NewNullLogger(),
+ },
+ ExpectedWriteToChannel: &device.FingerprintResponse{},
+ },
+ } {
+ outCh := make(chan *device.FingerprintResponse)
+ ctx, cancel := context.WithCancel(context.Background())
+ go testCase.Device.fingerprint(ctx, outCh)
+ result := <-outCh // only the first response is inspected
+ cancel() // stop the fingerprint goroutine before asserting
+ require.New(t).Equal(testCase.ExpectedWriteToChannel, result, testCase.Name)
+ }
+}
diff --git a/plugins/device/cmd/nvidia/nvml/client.go b/plugins/device/cmd/nvidia/nvml/client.go
new file mode 100644
index 000000000..d18dcbe1a
--- /dev/null
+++ b/plugins/device/cmd/nvidia/nvml/client.go
@@ -0,0 +1,194 @@
+package nvml
+
+import (
+ "fmt"
+)
+
+// DeviceData holds the Nvidia device fields shared by FingerprintDeviceData and StatsData
+type DeviceData struct {
+ UUID string
+ DeviceName *string // nil when nvml could not retrieve it (see DeviceInfo)
+ MemoryMiB *uint64 // nil when nvml could not retrieve it (see DeviceInfo)
+ PowerW *uint // nil when nvml could not retrieve it (see DeviceInfo)
+ BAR1MiB *uint64 // nil when nvml could not retrieve it (see DeviceInfo)
+}
+
+// FingerprintDeviceData is a superset of DeviceData.
+// It describes the device-specific fields returned from
+// nvml queries during a fingerprinting call.
+type FingerprintDeviceData struct {
+ *DeviceData
+ PCIBandwidthMBPerS *uint // nil when nvml could not retrieve it (see DeviceInfo)
+ CoresClockMHz *uint // nil when nvml could not retrieve it (see DeviceInfo)
+ MemoryClockMHz *uint // nil when nvml could not retrieve it (see DeviceInfo)
+ DisplayState string
+ PersistenceMode string
+ PCIBusID string
+}
+
+// FingerprintData represents the attributes of the driver and of its devices
+type FingerprintData struct {
+ Devices []*FingerprintDeviceData // one entry per device index reported by the driver
+ DriverVersion string // value returned by NvmlDriver.SystemDriverVersion
+}
+
+// StatsData is a superset of DeviceData.
+// It represents the statistics returned for every Nvidia device; the pointer fields are copied from DeviceStatus and may be nil.
+type StatsData struct {
+ *DeviceData
+ PowerUsageW *uint
+ GPUUtilization *uint
+ MemoryUtilization *uint
+ EncoderUtilization *uint
+ DecoderUtilization *uint
+ TemperatureC *uint
+ UsedMemoryMiB *uint64
+ BAR1UsedMiB *uint64
+ ECCErrorsL1Cache *uint64
+ ECCErrorsL2Cache *uint64
+ ECCErrorsDevice *uint64
+}
+
+// NvmlClient describes how users are expected to use the nvml library
+type NvmlClient interface {
+ GetFingerprintData() (*FingerprintData, error) // driver version plus per-device fingerprint fields
+ GetStatsData() ([]*StatsData, error) // one statistics entry per device
+}
+
+// nvmlClient implements NvmlClient.
+// Users of this lib are expected to obtain this struct via the NewNvmlClient func.
+type nvmlClient struct {
+ driver NvmlDriver // every nvml query made by the client goes through this driver
+}
+
+// NewNvmlClient returns an nvmlClient wired to the real NvmlDriver
+// implementation. The driver is initialized here; if that fails, the
+// initialization error is returned and no client is created.
+func NewNvmlClient() (*nvmlClient, error) {
+	realDriver := &nvmlDriver{}
+	if initErr := realDriver.Initialize(); initErr != nil {
+		return nil, initErr
+	}
+
+	client := &nvmlClient{driver: realDriver}
+	return client, nil
+}
+
+// GetFingerprintData returns the driver version together with fingerprint
+// data for every Nvidia device the driver reports. It fails with a nil
+// result as soon as any driver call returns an error.
+func (c *nvmlClient) GetFingerprintData() (*FingerprintData, error) {
+	/*
+		nvml fields to be fingerprinted   # nvml_library_call
+		 1 - Driver Version               # nvmlSystemGetDriverVersion
+		 2 - Product Name                 # nvmlDeviceGetName
+		 3 - GPU UUID                     # nvmlDeviceGetUUID
+		 4 - Total Memory                 # nvmlDeviceGetMemoryInfo
+		 5 - Power                        # nvmlDeviceGetPowerManagementLimit
+		 6 - PCIBusID                     # nvmlDeviceGetPciInfo
+		 7 - BAR1 Memory                  # nvmlDeviceGetBAR1MemoryInfo
+		 8 - PCI Bandwidth
+		 9 - Memory, Cores Clock          # nvmlDeviceGetMaxClockInfo
+		10 - Display Mode                 # nvmlDeviceGetDisplayMode
+		11 - Persistence Mode             # nvmlDeviceGetPersistenceMode
+	*/
+
+	// The receiver must come from NewNvmlClient, which initializes the driver.
+
+	driverVersion, err := c.driver.SystemDriverVersion()
+	if err != nil {
+		return nil, fmt.Errorf("nvidia nvml SystemDriverVersion() error: %v", err)
+	}
+
+	numDevices, err := c.driver.DeviceCount()
+	if err != nil {
+		return nil, fmt.Errorf("nvidia nvml DeviceCount() error: %v", err)
+	}
+
+	devices := make([]*FingerprintDeviceData, numDevices)
+
+	for i := range devices {
+		info, err := c.driver.DeviceInfoByIndex(uint(i))
+		if err != nil {
+			return nil, fmt.Errorf("nvidia nvml DeviceInfoByIndex() error: %v", err)
+		}
+
+		devices[i] = &FingerprintDeviceData{
+			DeviceData: &DeviceData{
+				DeviceName: info.Name,
+				UUID:       info.UUID,
+				MemoryMiB:  info.MemoryMiB,
+				PowerW:     info.PowerW,
+				BAR1MiB:    info.BAR1MiB,
+			},
+			PCIBandwidthMBPerS: info.PCIBandwidthMBPerS,
+			CoresClockMHz:      info.CoresClockMHz,
+			MemoryClockMHz:     info.MemoryClockMHz,
+			DisplayState:       info.DisplayState,
+			PersistenceMode:    info.PersistenceMode,
+			PCIBusID:           info.PCIBusID,
+		}
+	}
+	return &FingerprintData{
+		Devices:       devices,
+		DriverVersion: driverVersion,
+	}, nil
+}
+
+// GetStatsData returns statistics data for every Nvidia device the driver
+// reports. It fails with a nil result as soon as any driver call returns
+// an error.
+func (c *nvmlClient) GetStatsData() ([]*StatsData, error) {
+	/*
+		nvml fields to be reported to stats api         # nvml_library_call
+		 1 - Used Memory                                # nvmlDeviceGetMemoryInfo
+		 2 - Utilization of GPU                         # nvmlDeviceGetUtilizationRates
+		 3 - Utilization of Memory                      # nvmlDeviceGetUtilizationRates
+		 4 - Utilization of Decoder                     # nvmlDeviceGetDecoderUtilization
+		 5 - Utilization of Encoder                     # nvmlDeviceGetEncoderUtilization
+		 6 - Current GPU Temperature                    # nvmlDeviceGetTemperature
+		 7 - Power Draw                                 # nvmlDeviceGetPowerUsage
+		 8 - BAR1 Used memory                           # nvmlDeviceGetBAR1MemoryInfo
+		 9 - ECC Errors on requesting L1Cache           # nvmlDeviceGetMemoryErrorCounter
+		10 - ECC Errors on requesting L2Cache           # nvmlDeviceGetMemoryErrorCounter
+		11 - ECC Errors on requesting Device memory     # nvmlDeviceGetMemoryErrorCounter
+	*/
+
+	// The receiver must come from NewNvmlClient, which initializes the driver.
+
+	numDevices, err := c.driver.DeviceCount()
+	if err != nil {
+		return nil, fmt.Errorf("nvidia nvml DeviceCount() error: %v", err)
+	}
+
+	stats := make([]*StatsData, numDevices)
+
+	for i := range stats {
+		info, status, err := c.driver.DeviceInfoAndStatusByIndex(uint(i))
+		if err != nil {
+			return nil, fmt.Errorf("nvidia nvml DeviceInfoAndStatusByIndex() error: %v", err)
+		}
+
+		stats[i] = &StatsData{
+			DeviceData: &DeviceData{
+				DeviceName: info.Name,
+				UUID:       info.UUID,
+				MemoryMiB:  info.MemoryMiB,
+				PowerW:     info.PowerW,
+				BAR1MiB:    info.BAR1MiB,
+			},
+			PowerUsageW:        status.PowerUsageW,
+			GPUUtilization:     status.GPUUtilization,
+			MemoryUtilization:  status.MemoryUtilization,
+			EncoderUtilization: status.EncoderUtilization,
+			DecoderUtilization: status.DecoderUtilization,
+			TemperatureC:       status.TemperatureC,
+			UsedMemoryMiB:      status.UsedMemoryMiB,
+			BAR1UsedMiB:        status.BAR1UsedMiB,
+			ECCErrorsL1Cache:   status.ECCErrorsL1Cache,
+			ECCErrorsL2Cache:   status.ECCErrorsL2Cache,
+			ECCErrorsDevice:    status.ECCErrorsDevice,
+		}
+	}
+	return stats, nil
+}
diff --git a/plugins/device/cmd/nvidia/nvml/client_test.go b/plugins/device/cmd/nvidia/nvml/client_test.go
new file mode 100644
index 000000000..23731f7b0
--- /dev/null
+++ b/plugins/device/cmd/nvidia/nvml/client_test.go
@@ -0,0 +1,399 @@
+package nvml
+
+import (
+ "errors"
+ "testing"
+
+ "github.com/hashicorp/nomad/helper"
+ "github.com/stretchr/testify/require"
+)
+
+// MockNVMLDriver is a configurable NvmlDriver for the tests in this file: each
+// *CallSuccessful flag decides whether the matching call succeeds or errors.
+type MockNVMLDriver struct {
+	systemDriverCallSuccessful               bool
+	deviceCountCallSuccessful                bool
+	deviceInfoByIndexCallSuccessful          bool
+	deviceInfoAndStatusByIndexCallSuccessful bool
+	driverVersion                            string
+	devices                                  []*DeviceInfo
+	deviceStatus                             []*DeviceStatus
+}
+
+// Initialize is a no-op that always succeeds.
+func (m *MockNVMLDriver) Initialize() error { return nil }
+
+// Shutdown is a no-op that always succeeds.
+func (m *MockNVMLDriver) Shutdown() error { return nil }
+
+func (m *MockNVMLDriver) SystemDriverVersion() (string, error) {
+	if m.systemDriverCallSuccessful {
+		return m.driverVersion, nil
+	}
+	return "", errors.New("failed to get system driver")
+}
+
+func (m *MockNVMLDriver) DeviceCount() (uint, error) {
+	if m.deviceCountCallSuccessful {
+		return uint(len(m.devices)), nil
+	}
+	return 0, errors.New("failed to get device length")
+}
+
+func (m *MockNVMLDriver) DeviceInfoByIndex(index uint) (*DeviceInfo, error) {
+	switch {
+	case index >= uint(len(m.devices)):
+		return nil, errors.New("index is out of range")
+	case !m.deviceInfoByIndexCallSuccessful:
+		return nil, errors.New("failed to get device info by index")
+	}
+	return m.devices[index], nil
+}
+
+func (m *MockNVMLDriver) DeviceInfoAndStatusByIndex(index uint) (*DeviceInfo, *DeviceStatus, error) {
+	switch {
+	case index >= uint(len(m.devices)) || index >= uint(len(m.deviceStatus)):
+		return nil, nil, errors.New("index is out of range")
+	case !m.deviceInfoAndStatusByIndexCallSuccessful:
+		return nil, nil, errors.New("failed to get device info and status by index")
+	}
+	return m.devices[index], m.deviceStatus[index], nil
+}
+
+func TestGetFingerprintDataFromNVML(t *testing.T) {
+ for _, testCase := range []struct {
+ Name string // case label, reported with every failure
+ DriverConfiguration *MockNVMLDriver // mock driver backing the client under test
+ ExpectedError bool // whether GetFingerprintData must return an error
+ ExpectedResult *FingerprintData // nil for the failing cases
+ }{
+ {
+ Name: "fail on systemDriverCallSuccessful",
+ ExpectedError: true,
+ ExpectedResult: nil,
+ DriverConfiguration: &MockNVMLDriver{
+ systemDriverCallSuccessful: false,
+ deviceCountCallSuccessful: true,
+ deviceInfoByIndexCallSuccessful: true,
+ },
+ },
+ {
+ Name: "fail on deviceCountCallSuccessful",
+ ExpectedError: true,
+ ExpectedResult: nil,
+ DriverConfiguration: &MockNVMLDriver{
+ systemDriverCallSuccessful: true,
+ deviceCountCallSuccessful: false,
+ deviceInfoByIndexCallSuccessful: true,
+ },
+ },
+ {
+ Name: "fail on deviceInfoByIndexCall",
+ ExpectedError: true,
+ ExpectedResult: nil,
+ DriverConfiguration: &MockNVMLDriver{
+ systemDriverCallSuccessful: true,
+ deviceCountCallSuccessful: true,
+ deviceInfoByIndexCallSuccessful: false,
+ devices: []*DeviceInfo{
+ {
+ UUID: "UUID1",
+ Name: helper.StringToPtr("ModelName1"),
+ MemoryMiB: helper.Uint64ToPtr(16),
+ PCIBusID: "busId",
+ PowerW: helper.UintToPtr(100),
+ BAR1MiB: helper.Uint64ToPtr(100),
+ PCIBandwidthMBPerS: helper.UintToPtr(100),
+ CoresClockMHz: helper.UintToPtr(100),
+ MemoryClockMHz: helper.UintToPtr(100),
+ }, {
+ UUID: "UUID2",
+ Name: helper.StringToPtr("ModelName2"),
+ MemoryMiB: helper.Uint64ToPtr(8),
+ PCIBusID: "busId",
+ PowerW: helper.UintToPtr(100),
+ BAR1MiB: helper.Uint64ToPtr(100),
+ PCIBandwidthMBPerS: helper.UintToPtr(100),
+ CoresClockMHz: helper.UintToPtr(100),
+ MemoryClockMHz: helper.UintToPtr(100),
+ },
+ },
+ },
+ },
+ {
+ Name: "successful outcome",
+ ExpectedError: false,
+ ExpectedResult: &FingerprintData{
+ DriverVersion: "driverVersion",
+ Devices: []*FingerprintDeviceData{
+ {
+ DeviceData: &DeviceData{
+ DeviceName: helper.StringToPtr("ModelName1"),
+ UUID: "UUID1",
+ MemoryMiB: helper.Uint64ToPtr(16),
+ PowerW: helper.UintToPtr(100),
+ BAR1MiB: helper.Uint64ToPtr(100),
+ },
+ PCIBusID: "busId1",
+ PCIBandwidthMBPerS: helper.UintToPtr(100),
+ CoresClockMHz: helper.UintToPtr(100),
+ MemoryClockMHz: helper.UintToPtr(100),
+ DisplayState: "Enabled",
+ PersistenceMode: "Enabled",
+ }, {
+ DeviceData: &DeviceData{
+ DeviceName: helper.StringToPtr("ModelName2"),
+ UUID: "UUID2",
+ MemoryMiB: helper.Uint64ToPtr(8),
+ PowerW: helper.UintToPtr(200),
+ BAR1MiB: helper.Uint64ToPtr(200),
+ },
+ PCIBusID: "busId2",
+ PCIBandwidthMBPerS: helper.UintToPtr(200),
+ CoresClockMHz: helper.UintToPtr(200),
+ MemoryClockMHz: helper.UintToPtr(200),
+ DisplayState: "Enabled",
+ PersistenceMode: "Enabled",
+ },
+ },
+ },
+ DriverConfiguration: &MockNVMLDriver{
+ systemDriverCallSuccessful: true,
+ deviceCountCallSuccessful: true,
+ deviceInfoByIndexCallSuccessful: true,
+ driverVersion: "driverVersion",
+ devices: []*DeviceInfo{
+ {
+ UUID: "UUID1",
+ Name: helper.StringToPtr("ModelName1"),
+ MemoryMiB: helper.Uint64ToPtr(16),
+ PCIBusID: "busId1",
+ PowerW: helper.UintToPtr(100),
+ BAR1MiB: helper.Uint64ToPtr(100),
+ PCIBandwidthMBPerS: helper.UintToPtr(100),
+ CoresClockMHz: helper.UintToPtr(100),
+ MemoryClockMHz: helper.UintToPtr(100),
+ DisplayState: "Enabled",
+ PersistenceMode: "Enabled",
+ }, {
+ UUID: "UUID2",
+ Name: helper.StringToPtr("ModelName2"),
+ MemoryMiB: helper.Uint64ToPtr(8),
+ PCIBusID: "busId2",
+ PowerW: helper.UintToPtr(200),
+ BAR1MiB: helper.Uint64ToPtr(200),
+ PCIBandwidthMBPerS: helper.UintToPtr(200),
+ CoresClockMHz: helper.UintToPtr(200),
+ MemoryClockMHz: helper.UintToPtr(200),
+ DisplayState: "Enabled",
+ PersistenceMode: "Enabled",
+ },
+ },
+ },
+ },
+ } {
+ cli := nvmlClient{driver: testCase.DriverConfiguration}
+ fingerprintData, err := cli.GetFingerprintData()
+ if testCase.ExpectedError && err == nil {
+ t.Errorf("case '%s' : expected Error, but didn't get one", testCase.Name)
+ }
+ if !testCase.ExpectedError && err != nil {
+ t.Errorf("case '%s' : unexpected Error '%v'", testCase.Name, err)
+ }
+ require.New(t).Equal(testCase.ExpectedResult, fingerprintData, testCase.Name)
+ }
+}
+
+func TestGetStatsDataFromNVML(t *testing.T) {
+ for _, testCase := range []struct {
+ Name string // case label, reported with every failure
+ DriverConfiguration *MockNVMLDriver // mock driver backing the client under test
+ ExpectedError bool // whether GetStatsData must return an error
+ ExpectedResult []*StatsData // nil for the failing cases
+ }{
+ {
+ Name: "fail on deviceCountCallSuccessful",
+ ExpectedError: true,
+ ExpectedResult: nil,
+ DriverConfiguration: &MockNVMLDriver{
+ systemDriverCallSuccessful: true,
+ deviceCountCallSuccessful: false,
+ deviceInfoByIndexCallSuccessful: true,
+ deviceInfoAndStatusByIndexCallSuccessful: true,
+ },
+ },
+ {
+ Name: "fail on DeviceInfoAndStatusByIndex call",
+ ExpectedError: true,
+ ExpectedResult: nil,
+ DriverConfiguration: &MockNVMLDriver{
+ systemDriverCallSuccessful: true,
+ deviceCountCallSuccessful: true,
+ deviceInfoAndStatusByIndexCallSuccessful: false,
+ devices: []*DeviceInfo{
+ {
+ UUID: "UUID1",
+ Name: helper.StringToPtr("ModelName1"),
+ MemoryMiB: helper.Uint64ToPtr(16),
+ PCIBusID: "busId1",
+ PowerW: helper.UintToPtr(100),
+ BAR1MiB: helper.Uint64ToPtr(100),
+ PCIBandwidthMBPerS: helper.UintToPtr(100),
+ CoresClockMHz: helper.UintToPtr(100),
+ MemoryClockMHz: helper.UintToPtr(100),
+ }, {
+ UUID: "UUID2",
+ Name: helper.StringToPtr("ModelName2"),
+ MemoryMiB: helper.Uint64ToPtr(8),
+ PCIBusID: "busId2",
+ PowerW: helper.UintToPtr(200),
+ BAR1MiB: helper.Uint64ToPtr(200),
+ PCIBandwidthMBPerS: helper.UintToPtr(200),
+ CoresClockMHz: helper.UintToPtr(200),
+ MemoryClockMHz: helper.UintToPtr(200),
+ },
+ },
+ deviceStatus: []*DeviceStatus{
+ {
+ TemperatureC: helper.UintToPtr(1),
+ GPUUtilization: helper.UintToPtr(1),
+ MemoryUtilization: helper.UintToPtr(1),
+ EncoderUtilization: helper.UintToPtr(1),
+ DecoderUtilization: helper.UintToPtr(1),
+ UsedMemoryMiB: helper.Uint64ToPtr(1),
+ ECCErrorsL1Cache: helper.Uint64ToPtr(1),
+ ECCErrorsL2Cache: helper.Uint64ToPtr(1),
+ ECCErrorsDevice: helper.Uint64ToPtr(1),
+ PowerUsageW: helper.UintToPtr(1),
+ BAR1UsedMiB: helper.Uint64ToPtr(1),
+ },
+ {
+ TemperatureC: helper.UintToPtr(2),
+ GPUUtilization: helper.UintToPtr(2),
+ MemoryUtilization: helper.UintToPtr(2),
+ EncoderUtilization: helper.UintToPtr(2),
+ DecoderUtilization: helper.UintToPtr(2),
+ UsedMemoryMiB: helper.Uint64ToPtr(2),
+ ECCErrorsL1Cache: helper.Uint64ToPtr(2),
+ ECCErrorsL2Cache: helper.Uint64ToPtr(2),
+ ECCErrorsDevice: helper.Uint64ToPtr(2),
+ PowerUsageW: helper.UintToPtr(2),
+ BAR1UsedMiB: helper.Uint64ToPtr(2),
+ },
+ },
+ },
+ },
+ {
+ Name: "successful outcome",
+ ExpectedError: false,
+ ExpectedResult: []*StatsData{
+ {
+ DeviceData: &DeviceData{
+ DeviceName: helper.StringToPtr("ModelName1"),
+ UUID: "UUID1",
+ MemoryMiB: helper.Uint64ToPtr(16),
+ PowerW: helper.UintToPtr(100),
+ BAR1MiB: helper.Uint64ToPtr(100),
+ },
+ TemperatureC: helper.UintToPtr(1),
+ GPUUtilization: helper.UintToPtr(1),
+ MemoryUtilization: helper.UintToPtr(1),
+ EncoderUtilization: helper.UintToPtr(1),
+ DecoderUtilization: helper.UintToPtr(1),
+ UsedMemoryMiB: helper.Uint64ToPtr(1),
+ ECCErrorsL1Cache: helper.Uint64ToPtr(1),
+ ECCErrorsL2Cache: helper.Uint64ToPtr(1),
+ ECCErrorsDevice: helper.Uint64ToPtr(1),
+ PowerUsageW: helper.UintToPtr(1),
+ BAR1UsedMiB: helper.Uint64ToPtr(1),
+ },
+ {
+ DeviceData: &DeviceData{
+ DeviceName: helper.StringToPtr("ModelName2"),
+ UUID: "UUID2",
+ MemoryMiB: helper.Uint64ToPtr(8),
+ PowerW: helper.UintToPtr(200),
+ BAR1MiB: helper.Uint64ToPtr(200),
+ },
+ TemperatureC: helper.UintToPtr(2),
+ GPUUtilization: helper.UintToPtr(2),
+ MemoryUtilization: helper.UintToPtr(2),
+ EncoderUtilization: helper.UintToPtr(2),
+ DecoderUtilization: helper.UintToPtr(2),
+ UsedMemoryMiB: helper.Uint64ToPtr(2),
+ ECCErrorsL1Cache: helper.Uint64ToPtr(2),
+ ECCErrorsL2Cache: helper.Uint64ToPtr(2),
+ ECCErrorsDevice: helper.Uint64ToPtr(2),
+ PowerUsageW: helper.UintToPtr(2),
+ BAR1UsedMiB: helper.Uint64ToPtr(2),
+ },
+ },
+ DriverConfiguration: &MockNVMLDriver{
+ deviceCountCallSuccessful: true,
+ deviceInfoByIndexCallSuccessful: true,
+ deviceInfoAndStatusByIndexCallSuccessful: true,
+ devices: []*DeviceInfo{
+ {
+ UUID: "UUID1",
+ Name: helper.StringToPtr("ModelName1"),
+ MemoryMiB: helper.Uint64ToPtr(16),
+ PCIBusID: "busId1",
+ PowerW: helper.UintToPtr(100),
+ BAR1MiB: helper.Uint64ToPtr(100),
+ PCIBandwidthMBPerS: helper.UintToPtr(100),
+ CoresClockMHz: helper.UintToPtr(100),
+ MemoryClockMHz: helper.UintToPtr(100),
+ }, {
+ UUID: "UUID2",
+ Name: helper.StringToPtr("ModelName2"),
+ MemoryMiB: helper.Uint64ToPtr(8),
+ PCIBusID: "busId2",
+ PowerW: helper.UintToPtr(200),
+ BAR1MiB: helper.Uint64ToPtr(200),
+ PCIBandwidthMBPerS: helper.UintToPtr(200),
+ CoresClockMHz: helper.UintToPtr(200),
+ MemoryClockMHz: helper.UintToPtr(200),
+ },
+ },
+ deviceStatus: []*DeviceStatus{
+ {
+ TemperatureC: helper.UintToPtr(1),
+ GPUUtilization: helper.UintToPtr(1),
+ MemoryUtilization: helper.UintToPtr(1),
+ EncoderUtilization: helper.UintToPtr(1),
+ DecoderUtilization: helper.UintToPtr(1),
+ UsedMemoryMiB: helper.Uint64ToPtr(1),
+ ECCErrorsL1Cache: helper.Uint64ToPtr(1),
+ ECCErrorsL2Cache: helper.Uint64ToPtr(1),
+ ECCErrorsDevice: helper.Uint64ToPtr(1),
+ PowerUsageW: helper.UintToPtr(1),
+ BAR1UsedMiB: helper.Uint64ToPtr(1),
+ },
+ {
+ TemperatureC: helper.UintToPtr(2),
+ GPUUtilization: helper.UintToPtr(2),
+ MemoryUtilization: helper.UintToPtr(2),
+ EncoderUtilization: helper.UintToPtr(2),
+ DecoderUtilization: helper.UintToPtr(2),
+ UsedMemoryMiB: helper.Uint64ToPtr(2),
+ ECCErrorsL1Cache: helper.Uint64ToPtr(2),
+ ECCErrorsL2Cache: helper.Uint64ToPtr(2),
+ ECCErrorsDevice: helper.Uint64ToPtr(2),
+ PowerUsageW: helper.UintToPtr(2),
+ BAR1UsedMiB: helper.Uint64ToPtr(2),
+ },
+ },
+ },
+ },
+ } {
+ cli := nvmlClient{driver: testCase.DriverConfiguration}
+ statsData, err := cli.GetStatsData()
+ if testCase.ExpectedError && err == nil {
+ t.Errorf("case '%s' : expected Error, but didn't get one", testCase.Name)
+ }
+ if !testCase.ExpectedError && err != nil {
+ t.Errorf("case '%s' : unexpected Error '%v'", testCase.Name, err)
+ }
+ require.New(t).Equal(testCase.ExpectedResult, statsData, testCase.Name)
+ }
+}
diff --git a/plugins/device/cmd/nvidia/nvml/driver.go b/plugins/device/cmd/nvidia/nvml/driver.go
new file mode 100644
index 000000000..ef1ba57c4
--- /dev/null
+++ b/plugins/device/cmd/nvidia/nvml/driver.go
@@ -0,0 +1,138 @@
+package nvml
+
+import (
+ "github.com/NVIDIA/gpu-monitoring-tools/bindings/go/nvml"
+)
+
+// DeviceInfo represents nvml device data.
+// This struct is returned by the NvmlDriver DeviceInfoByIndex and
+// DeviceInfoAndStatusByIndex methods.
+type DeviceInfo struct {
+ // The following fields are guaranteed to be retrieved from nvml.
+ // Note that nvmlDriver.DeviceInfoAndStatusByIndex does not fill in
+ // DisplayState and PersistenceMode; only DeviceInfoByIndex sets them.
+ UUID string
+ PCIBusID string
+ DisplayState string
+ PersistenceMode string
+
+ // The following fields can be nil after a call to nvml, because nvml was
+ // not able to retrieve these fields for the specific nvidia card
+ Name *string
+ MemoryMiB *uint64
+ PowerW *uint
+ BAR1MiB *uint64
+ PCIBandwidthMBPerS *uint
+ CoresClockMHz *uint
+ MemoryClockMHz *uint
+}
+
+// DeviceStatus represents nvml device status.
+// This struct is returned by the NvmlDriver DeviceInfoAndStatusByIndex method.
+type DeviceStatus struct {
+ // The following fields can be nil after a call to nvml, because nvml was
+ // not able to retrieve these fields for the specific nvidia card
+ PowerUsageW *uint
+ TemperatureC *uint
+ GPUUtilization *uint // %
+ MemoryUtilization *uint // %
+ EncoderUtilization *uint // %
+ DecoderUtilization *uint // %
+ BAR1UsedMiB *uint64
+ UsedMemoryMiB *uint64
+ ECCErrorsL1Cache *uint64
+ ECCErrorsL2Cache *uint64
+ ECCErrorsDevice *uint64
+}
+
+// NvmlDriver represents the set of methods used to query the nvml library.
+// nvmlDriver in this file is the implementation backed by the NVIDIA bindings.
+type NvmlDriver interface {
+ // Initialize prepares the library; it is expected to be called before
+ // any other method.
+ Initialize() error
+ // Shutdown stops any further interaction with nvml.
+ Shutdown() error
+ // SystemDriverVersion returns the installed driver version.
+ SystemDriverVersion() (string, error)
+ // DeviceCount reports the number of available GPU devices.
+ DeviceCount() (uint, error)
+ // DeviceInfoByIndex returns the DeviceInfo for the GPU at the given index.
+ DeviceInfoByIndex(uint) (*DeviceInfo, error)
+ // DeviceInfoAndStatusByIndex returns both the DeviceInfo and the
+ // DeviceStatus for the GPU at the given index.
+ DeviceInfoAndStatusByIndex(uint) (*DeviceInfo, *DeviceStatus, error)
+}
+
+// nvmlDriver implements NvmlDriver on top of the NVIDIA nvml Go bindings.
+// It carries no state of its own; every method delegates to the nvml package.
+// Users are required to call the Initialize method before using any other methods.
+type nvmlDriver struct{}
+
+// Initialize prepares the nvml library for use by delegating to nvml.Init and
+// returns whatever error that call reports.
+// NOTE(review): the previous comment said this locates the nvml shared object
+// file and calls "ldopen" - presumably dlopen was meant; confirm against the
+// bindings.
+func (n *nvmlDriver) Initialize() error {
+ return nvml.Init()
+}
+
+// Shutdown stops any further interaction with nvml by delegating to
+// nvml.Shutdown and returns the error reported by that call, if any.
+func (n *nvmlDriver) Shutdown() error {
+ return nvml.Shutdown()
+}
+
+// SystemDriverVersion returns the installed driver version as reported by
+// nvml.GetDriverVersion, together with any error from that call.
+func (n *nvmlDriver) SystemDriverVersion() (string, error) {
+ return nvml.GetDriverVersion()
+}
+
+// DeviceCount reports the number of available GPU devices as returned by
+// nvml.GetDeviceCount, together with any error from that call.
+func (n *nvmlDriver) DeviceCount() (uint, error) {
+ return nvml.GetDeviceCount()
+}
+
+// DeviceInfoByIndex looks up the GPU at the given position in the system
+// device list and describes it as a DeviceInfo.
+//
+// The device handle and its device mode are both queried from nvml; if either
+// query fails the error is returned and the DeviceInfo is nil.
+func (n *nvmlDriver) DeviceInfoByIndex(index uint) (*DeviceInfo, error) {
+	gpu, err := nvml.NewDevice(index)
+	if err != nil {
+		return nil, err
+	}
+
+	mode, err := gpu.GetDeviceMode()
+	if err != nil {
+		return nil, err
+	}
+
+	// Identification and mode fields are plain values.
+	info := &DeviceInfo{
+		UUID:            gpu.UUID,
+		PCIBusID:        gpu.PCI.BusID,
+		DisplayState:    mode.DisplayInfo.Mode.String(),
+		PersistenceMode: mode.Persistence.String(),
+	}
+
+	// The remaining fields are pointers and are passed through unchanged.
+	info.Name = gpu.Model
+	info.MemoryMiB = gpu.Memory
+	info.PowerW = gpu.Power
+	info.BAR1MiB = gpu.PCI.BAR1
+	info.PCIBandwidthMBPerS = gpu.PCI.Bandwidth
+	info.CoresClockMHz = gpu.Clocks.Cores
+	info.MemoryClockMHz = gpu.Clocks.Memory
+
+	return info, nil
+}
+
+// DeviceInfoAndStatusByIndex looks up the GPU at the given position in the
+// system device list and returns both its DeviceInfo and its DeviceStatus.
+//
+// The device mode is not queried here, so DisplayState and PersistenceMode of
+// the returned DeviceInfo are left empty. If obtaining the device handle or
+// its status fails, the error is returned and both results are nil.
+func (n *nvmlDriver) DeviceInfoAndStatusByIndex(index uint) (*DeviceInfo, *DeviceStatus, error) {
+	gpu, err := nvml.NewDevice(index)
+	if err != nil {
+		return nil, nil, err
+	}
+
+	current, err := gpu.Status()
+	if err != nil {
+		return nil, nil, err
+	}
+
+	info := &DeviceInfo{
+		UUID:               gpu.UUID,
+		PCIBusID:           gpu.PCI.BusID,
+		Name:               gpu.Model,
+		MemoryMiB:          gpu.Memory,
+		PowerW:             gpu.Power,
+		BAR1MiB:            gpu.PCI.BAR1,
+		PCIBandwidthMBPerS: gpu.PCI.Bandwidth,
+		CoresClockMHz:      gpu.Clocks.Cores,
+		MemoryClockMHz:     gpu.Clocks.Memory,
+	}
+
+	status := &DeviceStatus{
+		PowerUsageW:        current.Power,
+		TemperatureC:       current.Temperature,
+		GPUUtilization:     current.Utilization.GPU,
+		MemoryUtilization:  current.Utilization.Memory,
+		EncoderUtilization: current.Utilization.Encoder,
+		DecoderUtilization: current.Utilization.Decoder,
+		BAR1UsedMiB:        current.PCI.BAR1Used,
+		UsedMemoryMiB:      current.Memory.Global.Used,
+		ECCErrorsL1Cache:   current.Memory.ECCErrors.L1Cache,
+		ECCErrorsL2Cache:   current.Memory.ECCErrors.L2Cache,
+		ECCErrorsDevice:    current.Memory.ECCErrors.Device,
+	}
+
+	return info, status, nil
+}
diff --git a/plugins/device/cmd/nvidia/stats.go b/plugins/device/cmd/nvidia/stats.go
new file mode 100644
index 000000000..022c710fc
--- /dev/null
+++ b/plugins/device/cmd/nvidia/stats.go
@@ -0,0 +1,301 @@
+package nvidia
+
+import (
+ "context"
+ "time"
+
+ "github.com/hashicorp/nomad/plugins/device"
+ "github.com/hashicorp/nomad/plugins/device/cmd/nvidia/nvml"
+)
+
+const (
+ // Attribute names, units and descriptions for reporting stats output.
+ // Every statistic comes as a triple: the *Attr constant is the key used in
+ // the attributes map built by statsForItem, while *Unit and *Desc fill the
+ // Unit and Desc fields of the corresponding device.StatValue.
+ PowerUsageAttr = "Power usage"
+ PowerUsageUnit = "W"
+ PowerUsageDesc = "Power usage for this GPU in watts and " +
+ "its associated circuitry (e.g. memory) / Maximum GPU Power"
+ GPUUtilizationAttr = "GPU utilization"
+ GPUUtilizationUnit = "%"
+ GPUUtilizationDesc = "Percent of time over the past sample period " +
+ "during which one or more kernels were executing on the GPU."
+ MemoryUtilizationAttr = "Memory utilization"
+ MemoryUtilizationUnit = "%"
+ MemoryUtilizationDesc = "Percentage of bandwidth used during the past sample period"
+ EncoderUtilizationAttr = "Encoder utilization"
+ EncoderUtilizationUnit = "%"
+ EncoderUtilizationDesc = "Percent of time over the past sample period " +
+ "during which GPU Encoder was used"
+ DecoderUtilizationAttr = "Decoder utilization"
+ DecoderUtilizationUnit = "%"
+ DecoderUtilizationDesc = "Percent of time over the past sample period " +
+ "during which GPU Decoder was used"
+ TemperatureAttr = "Temperature"
+ TemperatureUnit = "C" // Celsius degrees
+ TemperatureDesc = "Temperature of the Unit"
+ MemoryStateAttr = "Memory state"
+ MemoryStateUnit = "MiB" // Mebibytes
+ MemoryStateDesc = "UsedMemory / TotalMemory"
+ BAR1StateAttr = "BAR1 buffer state"
+ BAR1StateUnit = "MiB" // Mebibytes
+ BAR1StateDesc = "UsedBAR1 / TotalBAR1"
+ ECCErrorsL1CacheAttr = "ECC L1 errors"
+ ECCErrorsL1CacheUnit = "#" // number of errors
+ ECCErrorsL1CacheDesc = "Requested L1Cache error counter for the device"
+ ECCErrorsL2CacheAttr = "ECC L2 errors"
+ ECCErrorsL2CacheUnit = "#" // number of errors
+ ECCErrorsL2CacheDesc = "Requested L2Cache error counter for the device"
+ ECCErrorsDeviceAttr = "ECC memory errors"
+ ECCErrorsDeviceUnit = "#" // number of errors
+ ECCErrorsDeviceDesc = "Requested memory error counter for the device"
+)
+
+// stats is the long running goroutine that streams device statistics.
+//
+// It writes one StatsResponse to the stats channel immediately and then one
+// every d.statsPeriod until ctx is cancelled. The stats channel is always
+// closed on return. If NVML failed to load during initialization the error is
+// logged and the function returns without sending anything.
+func (d *NvidiaDevice) stats(ctx context.Context, stats chan<- *device.StatsResponse) {
+	defer close(stats)
+
+	if d.nvmlClientInitializationError != nil {
+		d.logger.Error("exiting stats due to problems with NVML loading", "error", d.nvmlClientInitializationError)
+		return
+	}
+
+	// Create a timer that will fire immediately for the first detection
+	timer := time.NewTimer(0)
+	// The timer is re-armed on every iteration, so one is always pending when
+	// the context is cancelled; stop it so it is released right away instead
+	// of lingering until it would have fired.
+	defer timer.Stop()
+
+	for {
+		select {
+		case <-ctx.Done():
+			return
+		case <-timer.C:
+			timer.Reset(d.statsPeriod)
+		}
+
+		d.writeStatsToChannel(stats, time.Now())
+	}
+}
+
+// filterStatsByID keeps only those entries of stats whose UUID is a key of the
+// IDs set, preserving their original order. The result is nil when no entry
+// matches.
+func filterStatsByID(stats []*nvml.StatsData, IDs map[string]struct{}) []*nvml.StatsData {
+	var kept []*nvml.StatsData
+	for i := range stats {
+		entry := stats[i]
+		if _, known := IDs[entry.UUID]; !known {
+			continue
+		}
+		kept = append(kept, entry)
+	}
+	return kept
+}
+
+// writeStatsToChannel collects StatsData from NVML backend, groups StatsData
+// by DeviceName attribute, populates DeviceGroupStats structure for every group
+// and sends data over provided channel
+func (d *NvidiaDevice) writeStatsToChannel(stats chan<- *device.StatsResponse, timestamp time.Time) {
+ statsData, err := d.nvmlClient.GetStatsData()
+ if err != nil {
+ d.logger.Error("failed to get nvidia stats", "error", err)
+ stats <- &device.StatsResponse{
+ Error: err,
+ }
+ return
+ }
+
+ // filter only stats from devices that are stored in NvidiaDevice struct
+ d.deviceLock.RLock()
+ statsData = filterStatsByID(statsData, d.devices)
+ d.deviceLock.RUnlock()
+
+ // group stats by DeviceName struct field
+ statsListByDeviceName := make(map[string][]*nvml.StatsData)
+ for _, statsItem := range statsData {
+ deviceName := statsItem.DeviceName
+ if deviceName == nil {
+ // nvml driver was not able to detect device name. This kind
+ // of devices are placed to single group with 'notAvailable' name
+ notAvailableCopy := notAvailable
+ deviceName = ¬AvailableCopy
+ }
+
+ statsListByDeviceName[*deviceName] = append(statsListByDeviceName[*deviceName], statsItem)
+ }
+
+ // place data device.DeviceGroupStats struct for every group of stats
+ deviceGroupsStats := make([]*device.DeviceGroupStats, 0, len(statsListByDeviceName))
+ for groupName, groupStats := range statsListByDeviceName {
+ deviceGroupsStats = append(deviceGroupsStats, statsForGroup(groupName, groupStats, timestamp))
+ }
+
+ stats <- &device.StatsResponse{
+ Groups: deviceGroupsStats,
+ }
+}
+
+// newNotAvailableDeviceStats builds the placeholder StatValue used when a
+// reading is missing: it carries the given unit and description and has its
+// string value set to the notAvailable marker.
+func newNotAvailableDeviceStats(unit, desc string) *device.StatValue {
+	placeholder := &device.StatValue{
+		Unit: unit,
+		Desc: desc,
+	}
+	placeholder.StringVal = notAvailable
+	return placeholder
+}
+
+// statsForGroup assembles the device.DeviceGroupStats for one device group.
+// The group is labelled with groupName and contains one DeviceStats entry per
+// item of groupStats, keyed by the item's UUID and stamped with timestamp.
+func statsForGroup(groupName string, groupStats []*nvml.StatsData, timestamp time.Time) *device.DeviceGroupStats {
+	group := &device.DeviceGroupStats{
+		Vendor:        vendor,
+		Type:          deviceType,
+		Name:          groupName,
+		InstanceStats: make(map[string]*device.DeviceStats),
+	}
+
+	for i := range groupStats {
+		item := groupStats[i]
+		group.InstanceStats[item.UUID] = statsForItem(item, timestamp)
+	}
+
+	return group
+}
+
+// statsForItem converts a single nvml.StatsData into a device.DeviceStats
+// stamped with timestamp.
+//
+// nvml.StatsData holds pointers that can be nil. A statistic whose reading
+// (or, for "used / total" statistics, either of the two readings) is nil is
+// reported with the 'notAvailable' string value instead of numbers. The
+// temperature statistic doubles as the summary of the device.
+func statsForItem(statsItem *nvml.StatsData, timestamp time.Time) *device.DeviceStats {
+	// fromUint reports an optional uint reading as a numerator-only value.
+	fromUint := func(reading *uint, unit, desc string) *device.StatValue {
+		if reading != nil {
+			return &device.StatValue{
+				Unit:            unit,
+				Desc:            desc,
+				IntNumeratorVal: int64(*reading),
+			}
+		}
+		return newNotAvailableDeviceStats(unit, desc)
+	}
+
+	// fromUint64 reports an optional uint64 reading as a numerator-only value.
+	fromUint64 := func(reading *uint64, unit, desc string) *device.StatValue {
+		if reading != nil {
+			return &device.StatValue{
+				Unit:            unit,
+				Desc:            desc,
+				IntNumeratorVal: int64(*reading),
+			}
+		}
+		return newNotAvailableDeviceStats(unit, desc)
+	}
+
+	// ratio reports a "used / total" pair of readings that are both known.
+	ratio := func(used, total int64, unit, desc string) *device.StatValue {
+		return &device.StatValue{
+			Unit:              unit,
+			Desc:              desc,
+			IntNumeratorVal:   used,
+			IntDenominatorVal: total,
+		}
+	}
+
+	var powerUsage *device.StatValue
+	if statsItem.PowerUsageW != nil && statsItem.PowerW != nil {
+		powerUsage = ratio(int64(*statsItem.PowerUsageW), int64(*statsItem.PowerW), PowerUsageUnit, PowerUsageDesc)
+	} else {
+		powerUsage = newNotAvailableDeviceStats(PowerUsageUnit, PowerUsageDesc)
+	}
+
+	gpuUtilization := fromUint(statsItem.GPUUtilization, GPUUtilizationUnit, GPUUtilizationDesc)
+	memoryUtilization := fromUint(statsItem.MemoryUtilization, MemoryUtilizationUnit, MemoryUtilizationDesc)
+	encoderUtilization := fromUint(statsItem.EncoderUtilization, EncoderUtilizationUnit, EncoderUtilizationDesc)
+	decoderUtilization := fromUint(statsItem.DecoderUtilization, DecoderUtilizationUnit, DecoderUtilizationDesc)
+	temperature := fromUint(statsItem.TemperatureC, TemperatureUnit, TemperatureDesc)
+
+	var memoryState *device.StatValue
+	if statsItem.UsedMemoryMiB != nil && statsItem.MemoryMiB != nil {
+		memoryState = ratio(int64(*statsItem.UsedMemoryMiB), int64(*statsItem.MemoryMiB), MemoryStateUnit, MemoryStateDesc)
+	} else {
+		memoryState = newNotAvailableDeviceStats(MemoryStateUnit, MemoryStateDesc)
+	}
+
+	var bar1State *device.StatValue
+	if statsItem.BAR1UsedMiB != nil && statsItem.BAR1MiB != nil {
+		bar1State = ratio(int64(*statsItem.BAR1UsedMiB), int64(*statsItem.BAR1MiB), BAR1StateUnit, BAR1StateDesc)
+	} else {
+		bar1State = newNotAvailableDeviceStats(BAR1StateUnit, BAR1StateDesc)
+	}
+
+	eccL1Cache := fromUint64(statsItem.ECCErrorsL1Cache, ECCErrorsL1CacheUnit, ECCErrorsL1CacheDesc)
+	eccL2Cache := fromUint64(statsItem.ECCErrorsL2Cache, ECCErrorsL2CacheUnit, ECCErrorsL2CacheDesc)
+	eccDevice := fromUint64(statsItem.ECCErrorsDevice, ECCErrorsDeviceUnit, ECCErrorsDeviceDesc)
+
+	attributes := map[string]*device.StatValue{
+		PowerUsageAttr:         powerUsage,
+		GPUUtilizationAttr:     gpuUtilization,
+		MemoryUtilizationAttr:  memoryUtilization,
+		EncoderUtilizationAttr: encoderUtilization,
+		DecoderUtilizationAttr: decoderUtilization,
+		TemperatureAttr:        temperature,
+		MemoryStateAttr:        memoryState,
+		BAR1StateAttr:          bar1State,
+		ECCErrorsL1CacheAttr:   eccL1Cache,
+		ECCErrorsL2CacheAttr:   eccL2Cache,
+		ECCErrorsDeviceAttr:    eccDevice,
+	}
+
+	return &device.DeviceStats{
+		// the same temperature value serves as summary and as attribute
+		Summary:   temperature,
+		Stats:     &device.StatObject{Attributes: attributes},
+		Timestamp: timestamp,
+	}
+}
diff --git a/plugins/device/cmd/nvidia/stats_test.go b/plugins/device/cmd/nvidia/stats_test.go
new file mode 100644
index 000000000..d60eb88da
--- /dev/null
+++ b/plugins/device/cmd/nvidia/stats_test.go
@@ -0,0 +1,3016 @@
+package nvidia
+
+import (
+ "errors"
+ "sort"
+ "testing"
+ "time"
+
+ hclog "github.com/hashicorp/go-hclog"
+ "github.com/hashicorp/nomad/helper"
+ "github.com/hashicorp/nomad/plugins/device"
+ "github.com/hashicorp/nomad/plugins/device/cmd/nvidia/nvml"
+ "github.com/stretchr/testify/require"
+)
+
+// TestFilterStatsByID checks that filterStatsByID keeps exactly the stats
+// entries whose UUID is present in the provided ID set, in their original
+// order, and returns nil when nothing matches.
+//
+// Every case runs as its own named subtest, so a failure identifies the case
+// and does not prevent the remaining cases from running.
+func TestFilterStatsByID(t *testing.T) {
+	// newStats builds the stats fixture shared by all cases; entries differ
+	// only in their UUID. A fresh value is built on every call so provided
+	// and expected data never share pointers.
+	newStats := func(uuid string) *nvml.StatsData {
+		return &nvml.StatsData{
+			DeviceData: &nvml.DeviceData{
+				UUID:       uuid,
+				DeviceName: helper.StringToPtr("DeviceName1"),
+				MemoryMiB:  helper.Uint64ToPtr(1),
+				PowerW:     helper.UintToPtr(2),
+				BAR1MiB:    helper.Uint64ToPtr(256),
+			},
+			PowerUsageW:        helper.UintToPtr(1),
+			GPUUtilization:     helper.UintToPtr(1),
+			MemoryUtilization:  helper.UintToPtr(1),
+			EncoderUtilization: helper.UintToPtr(1),
+			DecoderUtilization: helper.UintToPtr(1),
+			TemperatureC:       helper.UintToPtr(1),
+			UsedMemoryMiB:      helper.Uint64ToPtr(1),
+			ECCErrorsL1Cache:   helper.Uint64ToPtr(100),
+			ECCErrorsL2Cache:   helper.Uint64ToPtr(100),
+			ECCErrorsDevice:    helper.Uint64ToPtr(100),
+		}
+	}
+
+	// statsFor builds a list of fixtures for the given UUIDs, in order.
+	statsFor := func(uuids ...string) []*nvml.StatsData {
+		list := make([]*nvml.StatsData, 0, len(uuids))
+		for _, uuid := range uuids {
+			list = append(list, newStats(uuid))
+		}
+		return list
+	}
+
+	for _, testCase := range []struct {
+		Name           string
+		ProvidedStats  []*nvml.StatsData
+		ProvidedIDs    map[string]struct{}
+		ExpectedResult []*nvml.StatsData
+	}{
+		{
+			Name:          "All ids are in the map",
+			ProvidedStats: statsFor("UUID1", "UUID2", "UUID3"),
+			ProvidedIDs: map[string]struct{}{
+				"UUID1": {},
+				"UUID2": {},
+				"UUID3": {},
+			},
+			ExpectedResult: statsFor("UUID1", "UUID2", "UUID3"),
+		},
+		{
+			Name:          "Odd are not provided in the map",
+			ProvidedStats: statsFor("UUID1", "UUID2", "UUID3"),
+			ProvidedIDs: map[string]struct{}{
+				"UUID2": {},
+			},
+			ExpectedResult: statsFor("UUID2"),
+		},
+		{
+			Name:          "Even are not provided in the map",
+			ProvidedStats: statsFor("UUID1", "UUID2", "UUID3"),
+			ProvidedIDs: map[string]struct{}{
+				"UUID1": {},
+				"UUID3": {},
+			},
+			ExpectedResult: statsFor("UUID1", "UUID3"),
+		},
+		{
+			// no stats at all: the expected result is the nil slice
+			Name: "No Stats were provided",
+			ProvidedIDs: map[string]struct{}{
+				"UUID1": {},
+				"UUID2": {},
+				"UUID3": {},
+			},
+		},
+		{
+			// no IDs at all: nothing matches, the expected result is the nil slice
+			Name:          "No Ids were provided",
+			ProvidedStats: statsFor("UUID1", "UUID2", "UUID3"),
+		},
+	} {
+		t.Run(testCase.Name, func(t *testing.T) {
+			actualResult := filterStatsByID(testCase.ProvidedStats, testCase.ProvidedIDs)
+			require.New(t).Equal(testCase.ExpectedResult, actualResult)
+		})
+	}
+}
+
+func TestStatsForItem(t *testing.T) {
+ for _, testCase := range []struct {
+ Name string
+ Timestamp time.Time
+ ItemStat *nvml.StatsData
+ ExpectedResult *device.DeviceStats
+ }{
+ {
+ Name: "All fields in ItemStat are not nil",
+ Timestamp: time.Date(1974, time.May, 19, 1, 2, 3, 4, time.UTC),
+ ItemStat: &nvml.StatsData{
+ DeviceData: &nvml.DeviceData{
+ UUID: "UUID1",
+ DeviceName: helper.StringToPtr("DeviceName1"),
+ MemoryMiB: helper.Uint64ToPtr(1),
+ PowerW: helper.UintToPtr(1),
+ BAR1MiB: helper.Uint64ToPtr(256),
+ },
+ PowerUsageW: helper.UintToPtr(1),
+ GPUUtilization: helper.UintToPtr(1),
+ MemoryUtilization: helper.UintToPtr(1),
+ EncoderUtilization: helper.UintToPtr(1),
+ DecoderUtilization: helper.UintToPtr(1),
+ TemperatureC: helper.UintToPtr(1),
+ UsedMemoryMiB: helper.Uint64ToPtr(1),
+ BAR1UsedMiB: helper.Uint64ToPtr(1),
+ ECCErrorsL1Cache: helper.Uint64ToPtr(100),
+ ECCErrorsL2Cache: helper.Uint64ToPtr(100),
+ ECCErrorsDevice: helper.Uint64ToPtr(100),
+ },
+ ExpectedResult: &device.DeviceStats{
+ Summary: &device.StatValue{
+ Unit: TemperatureUnit,
+ Desc: TemperatureDesc,
+ IntNumeratorVal: 1,
+ },
+ Stats: &device.StatObject{
+ Attributes: map[string]*device.StatValue{
+ PowerUsageAttr: {
+ Unit: PowerUsageUnit,
+ Desc: PowerUsageDesc,
+ IntNumeratorVal: 1,
+ IntDenominatorVal: 1,
+ },
+ GPUUtilizationAttr: {
+ Unit: GPUUtilizationUnit,
+ Desc: GPUUtilizationDesc,
+ IntNumeratorVal: 1,
+ },
+ MemoryUtilizationAttr: {
+ Unit: MemoryUtilizationUnit,
+ Desc: MemoryUtilizationDesc,
+ IntNumeratorVal: 1,
+ },
+ EncoderUtilizationAttr: {
+ Unit: EncoderUtilizationUnit,
+ Desc: EncoderUtilizationDesc,
+ IntNumeratorVal: 1,
+ },
+ DecoderUtilizationAttr: {
+ Unit: DecoderUtilizationUnit,
+ Desc: DecoderUtilizationDesc,
+ IntNumeratorVal: 1,
+ },
+ TemperatureAttr: {
+ Unit: TemperatureUnit,
+ Desc: TemperatureDesc,
+ IntNumeratorVal: 1,
+ },
+ MemoryStateAttr: {
+ Unit: MemoryStateUnit,
+ Desc: MemoryStateDesc,
+ IntNumeratorVal: 1,
+ IntDenominatorVal: 1,
+ },
+ BAR1StateAttr: {
+ Unit: BAR1StateUnit,
+ Desc: BAR1StateDesc,
+ IntNumeratorVal: 1,
+ IntDenominatorVal: 256,
+ },
+ ECCErrorsL1CacheAttr: {
+ Unit: ECCErrorsL1CacheUnit,
+ Desc: ECCErrorsL1CacheDesc,
+ IntNumeratorVal: 100,
+ },
+ ECCErrorsL2CacheAttr: {
+ Unit: ECCErrorsL2CacheUnit,
+ Desc: ECCErrorsL2CacheDesc,
+ IntNumeratorVal: 100,
+ },
+ ECCErrorsDeviceAttr: {
+ Unit: ECCErrorsDeviceUnit,
+ Desc: ECCErrorsDeviceDesc,
+ IntNumeratorVal: 100,
+ },
+ },
+ },
+ Timestamp: time.Date(1974, time.May, 19, 1, 2, 3, 4, time.UTC),
+ },
+ },
+ {
+ Name: "Power usage is nil",
+ Timestamp: time.Date(1974, time.May, 19, 1, 2, 3, 4, time.UTC),
+ ItemStat: &nvml.StatsData{
+ DeviceData: &nvml.DeviceData{
+ UUID: "UUID1",
+ DeviceName: helper.StringToPtr("DeviceName1"),
+ MemoryMiB: helper.Uint64ToPtr(1),
+ PowerW: helper.UintToPtr(1),
+ BAR1MiB: helper.Uint64ToPtr(256),
+ },
+ PowerUsageW: nil,
+ GPUUtilization: helper.UintToPtr(1),
+ MemoryUtilization: helper.UintToPtr(1),
+ EncoderUtilization: helper.UintToPtr(1),
+ DecoderUtilization: helper.UintToPtr(1),
+ TemperatureC: helper.UintToPtr(1),
+ UsedMemoryMiB: helper.Uint64ToPtr(1),
+ BAR1UsedMiB: helper.Uint64ToPtr(1),
+ ECCErrorsL1Cache: helper.Uint64ToPtr(100),
+ ECCErrorsL2Cache: helper.Uint64ToPtr(100),
+ ECCErrorsDevice: helper.Uint64ToPtr(100),
+ },
+ ExpectedResult: &device.DeviceStats{
+ Summary: &device.StatValue{
+ Unit: TemperatureUnit,
+ Desc: TemperatureDesc,
+ IntNumeratorVal: 1,
+ },
+ Stats: &device.StatObject{
+ Attributes: map[string]*device.StatValue{
+ PowerUsageAttr: {
+ Unit: PowerUsageUnit,
+ Desc: PowerUsageDesc,
+ StringVal: notAvailable,
+ },
+ GPUUtilizationAttr: {
+ Unit: GPUUtilizationUnit,
+ Desc: GPUUtilizationDesc,
+ IntNumeratorVal: 1,
+ },
+ MemoryUtilizationAttr: {
+ Unit: MemoryUtilizationUnit,
+ Desc: MemoryUtilizationDesc,
+ IntNumeratorVal: 1,
+ },
+ EncoderUtilizationAttr: {
+ Unit: EncoderUtilizationUnit,
+ Desc: EncoderUtilizationDesc,
+ IntNumeratorVal: 1,
+ },
+ DecoderUtilizationAttr: {
+ Unit: DecoderUtilizationUnit,
+ Desc: DecoderUtilizationDesc,
+ IntNumeratorVal: 1,
+ },
+ TemperatureAttr: {
+ Unit: TemperatureUnit,
+ Desc: TemperatureDesc,
+ IntNumeratorVal: 1,
+ },
+ MemoryStateAttr: {
+ Unit: MemoryStateUnit,
+ Desc: MemoryStateDesc,
+ IntNumeratorVal: 1,
+ IntDenominatorVal: 1,
+ },
+ BAR1StateAttr: {
+ Unit: BAR1StateUnit,
+ Desc: BAR1StateDesc,
+ IntNumeratorVal: 1,
+ IntDenominatorVal: 256,
+ },
+ ECCErrorsL1CacheAttr: {
+ Unit: ECCErrorsL1CacheUnit,
+ Desc: ECCErrorsL1CacheDesc,
+ IntNumeratorVal: 100,
+ },
+ ECCErrorsL2CacheAttr: {
+ Unit: ECCErrorsL2CacheUnit,
+ Desc: ECCErrorsL2CacheDesc,
+ IntNumeratorVal: 100,
+ },
+ ECCErrorsDeviceAttr: {
+ Unit: ECCErrorsDeviceUnit,
+ Desc: ECCErrorsDeviceDesc,
+ IntNumeratorVal: 100,
+ },
+ },
+ },
+ Timestamp: time.Date(1974, time.May, 19, 1, 2, 3, 4, time.UTC),
+ },
+ },
+ {
+ Name: "PowerW is nil",
+ Timestamp: time.Date(1974, time.May, 19, 1, 2, 3, 4, time.UTC),
+ ItemStat: &nvml.StatsData{
+ DeviceData: &nvml.DeviceData{
+ UUID: "UUID1",
+ DeviceName: helper.StringToPtr("DeviceName1"),
+ MemoryMiB: helper.Uint64ToPtr(1),
+ PowerW: nil,
+ BAR1MiB: helper.Uint64ToPtr(256),
+ },
+ PowerUsageW: helper.UintToPtr(1),
+ GPUUtilization: helper.UintToPtr(1),
+ MemoryUtilization: helper.UintToPtr(1),
+ EncoderUtilization: helper.UintToPtr(1),
+ DecoderUtilization: helper.UintToPtr(1),
+ TemperatureC: helper.UintToPtr(1),
+ UsedMemoryMiB: helper.Uint64ToPtr(1),
+ BAR1UsedMiB: helper.Uint64ToPtr(1),
+ ECCErrorsL1Cache: helper.Uint64ToPtr(100),
+ ECCErrorsL2Cache: helper.Uint64ToPtr(100),
+ ECCErrorsDevice: helper.Uint64ToPtr(100),
+ },
+ ExpectedResult: &device.DeviceStats{
+ Summary: &device.StatValue{
+ Unit: TemperatureUnit,
+ Desc: TemperatureDesc,
+ IntNumeratorVal: 1,
+ },
+ Stats: &device.StatObject{
+ Attributes: map[string]*device.StatValue{
+ PowerUsageAttr: {
+ Unit: PowerUsageUnit,
+ Desc: PowerUsageDesc,
+ StringVal: notAvailable,
+ },
+ GPUUtilizationAttr: {
+ Unit: GPUUtilizationUnit,
+ Desc: GPUUtilizationDesc,
+ IntNumeratorVal: 1,
+ },
+ MemoryUtilizationAttr: {
+ Unit: MemoryUtilizationUnit,
+ Desc: MemoryUtilizationDesc,
+ IntNumeratorVal: 1,
+ },
+ EncoderUtilizationAttr: {
+ Unit: EncoderUtilizationUnit,
+ Desc: EncoderUtilizationDesc,
+ IntNumeratorVal: 1,
+ },
+ DecoderUtilizationAttr: {
+ Unit: DecoderUtilizationUnit,
+ Desc: DecoderUtilizationDesc,
+ IntNumeratorVal: 1,
+ },
+ TemperatureAttr: {
+ Unit: TemperatureUnit,
+ Desc: TemperatureDesc,
+ IntNumeratorVal: 1,
+ },
+ MemoryStateAttr: {
+ Unit: MemoryStateUnit,
+ Desc: MemoryStateDesc,
+ IntNumeratorVal: 1,
+ IntDenominatorVal: 1,
+ },
+ BAR1StateAttr: {
+ Unit: BAR1StateUnit,
+ Desc: BAR1StateDesc,
+ IntNumeratorVal: 1,
+ IntDenominatorVal: 256,
+ },
+ ECCErrorsL1CacheAttr: {
+ Unit: ECCErrorsL1CacheUnit,
+ Desc: ECCErrorsL1CacheDesc,
+ IntNumeratorVal: 100,
+ },
+ ECCErrorsL2CacheAttr: {
+ Unit: ECCErrorsL2CacheUnit,
+ Desc: ECCErrorsL2CacheDesc,
+ IntNumeratorVal: 100,
+ },
+ ECCErrorsDeviceAttr: {
+ Unit: ECCErrorsDeviceUnit,
+ Desc: ECCErrorsDeviceDesc,
+ IntNumeratorVal: 100,
+ },
+ },
+ },
+ Timestamp: time.Date(1974, time.May, 19, 1, 2, 3, 4, time.UTC),
+ },
+ },
+ {
+ Name: "GPUUtilization is nil",
+ Timestamp: time.Date(1974, time.May, 19, 1, 2, 3, 4, time.UTC),
+ ItemStat: &nvml.StatsData{
+ DeviceData: &nvml.DeviceData{
+ UUID: "UUID1",
+ DeviceName: helper.StringToPtr("DeviceName1"),
+ MemoryMiB: helper.Uint64ToPtr(1),
+ PowerW: helper.UintToPtr(1),
+ BAR1MiB: helper.Uint64ToPtr(256),
+ },
+ PowerUsageW: helper.UintToPtr(1),
+ GPUUtilization: nil,
+ MemoryUtilization: helper.UintToPtr(1),
+ EncoderUtilization: helper.UintToPtr(1),
+ DecoderUtilization: helper.UintToPtr(1),
+ TemperatureC: helper.UintToPtr(1),
+ UsedMemoryMiB: helper.Uint64ToPtr(1),
+ BAR1UsedMiB: helper.Uint64ToPtr(1),
+ ECCErrorsL1Cache: helper.Uint64ToPtr(100),
+ ECCErrorsL2Cache: helper.Uint64ToPtr(100),
+ ECCErrorsDevice: helper.Uint64ToPtr(100),
+ },
+ ExpectedResult: &device.DeviceStats{
+ Summary: &device.StatValue{
+ Unit: TemperatureUnit,
+ Desc: TemperatureDesc,
+ IntNumeratorVal: 1,
+ },
+ Stats: &device.StatObject{
+ Attributes: map[string]*device.StatValue{
+ PowerUsageAttr: {
+ Unit: PowerUsageUnit,
+ Desc: PowerUsageDesc,
+ IntNumeratorVal: 1,
+ IntDenominatorVal: 1,
+ },
+ GPUUtilizationAttr: {
+ Unit: GPUUtilizationUnit,
+ Desc: GPUUtilizationDesc,
+ StringVal: notAvailable,
+ },
+ MemoryUtilizationAttr: {
+ Unit: MemoryUtilizationUnit,
+ Desc: MemoryUtilizationDesc,
+ IntNumeratorVal: 1,
+ },
+ EncoderUtilizationAttr: {
+ Unit: EncoderUtilizationUnit,
+ Desc: EncoderUtilizationDesc,
+ IntNumeratorVal: 1,
+ },
+ DecoderUtilizationAttr: {
+ Unit: DecoderUtilizationUnit,
+ Desc: DecoderUtilizationDesc,
+ IntNumeratorVal: 1,
+ },
+ TemperatureAttr: {
+ Unit: TemperatureUnit,
+ Desc: TemperatureDesc,
+ IntNumeratorVal: 1,
+ },
+ MemoryStateAttr: {
+ Unit: MemoryStateUnit,
+ Desc: MemoryStateDesc,
+ IntNumeratorVal: 1,
+ IntDenominatorVal: 1,
+ },
+ BAR1StateAttr: {
+ Unit: BAR1StateUnit,
+ Desc: BAR1StateDesc,
+ IntNumeratorVal: 1,
+ IntDenominatorVal: 256,
+ },
+ ECCErrorsL1CacheAttr: {
+ Unit: ECCErrorsL1CacheUnit,
+ Desc: ECCErrorsL1CacheDesc,
+ IntNumeratorVal: 100,
+ },
+ ECCErrorsL2CacheAttr: {
+ Unit: ECCErrorsL2CacheUnit,
+ Desc: ECCErrorsL2CacheDesc,
+ IntNumeratorVal: 100,
+ },
+ ECCErrorsDeviceAttr: {
+ Unit: ECCErrorsDeviceUnit,
+ Desc: ECCErrorsDeviceDesc,
+ IntNumeratorVal: 100,
+ },
+ },
+ },
+ Timestamp: time.Date(1974, time.May, 19, 1, 2, 3, 4, time.UTC),
+ },
+ },
+ {
+ Name: "MemoryUtilization is nil",
+ Timestamp: time.Date(1974, time.May, 19, 1, 2, 3, 4, time.UTC),
+ ItemStat: &nvml.StatsData{
+ DeviceData: &nvml.DeviceData{
+ UUID: "UUID1",
+ DeviceName: helper.StringToPtr("DeviceName1"),
+ MemoryMiB: helper.Uint64ToPtr(1),
+ PowerW: helper.UintToPtr(1),
+ BAR1MiB: helper.Uint64ToPtr(256),
+ },
+ PowerUsageW: helper.UintToPtr(1),
+ GPUUtilization: helper.UintToPtr(1),
+ MemoryUtilization: nil,
+ EncoderUtilization: helper.UintToPtr(1),
+ DecoderUtilization: helper.UintToPtr(1),
+ TemperatureC: helper.UintToPtr(1),
+ UsedMemoryMiB: helper.Uint64ToPtr(1),
+ BAR1UsedMiB: helper.Uint64ToPtr(1),
+ ECCErrorsL1Cache: helper.Uint64ToPtr(100),
+ ECCErrorsL2Cache: helper.Uint64ToPtr(100),
+ ECCErrorsDevice: helper.Uint64ToPtr(100),
+ },
+ ExpectedResult: &device.DeviceStats{
+ Summary: &device.StatValue{
+ Unit: TemperatureUnit,
+ Desc: TemperatureDesc,
+ IntNumeratorVal: 1,
+ },
+ Stats: &device.StatObject{
+ Attributes: map[string]*device.StatValue{
+ PowerUsageAttr: {
+ Unit: PowerUsageUnit,
+ Desc: PowerUsageDesc,
+ IntNumeratorVal: 1,
+ IntDenominatorVal: 1,
+ },
+ GPUUtilizationAttr: {
+ Unit: GPUUtilizationUnit,
+ Desc: GPUUtilizationDesc,
+ IntNumeratorVal: 1,
+ },
+ MemoryUtilizationAttr: {
+ Unit: MemoryUtilizationUnit,
+ Desc: MemoryUtilizationDesc,
+ StringVal: notAvailable,
+ },
+ EncoderUtilizationAttr: {
+ Unit: EncoderUtilizationUnit,
+ Desc: EncoderUtilizationDesc,
+ IntNumeratorVal: 1,
+ },
+ DecoderUtilizationAttr: {
+ Unit: DecoderUtilizationUnit,
+ Desc: DecoderUtilizationDesc,
+ IntNumeratorVal: 1,
+ },
+ TemperatureAttr: {
+ Unit: TemperatureUnit,
+ Desc: TemperatureDesc,
+ IntNumeratorVal: 1,
+ },
+ MemoryStateAttr: {
+ Unit: MemoryStateUnit,
+ Desc: MemoryStateDesc,
+ IntNumeratorVal: 1,
+ IntDenominatorVal: 1,
+ },
+ BAR1StateAttr: {
+ Unit: BAR1StateUnit,
+ Desc: BAR1StateDesc,
+ IntNumeratorVal: 1,
+ IntDenominatorVal: 256,
+ },
+ ECCErrorsL1CacheAttr: {
+ Unit: ECCErrorsL1CacheUnit,
+ Desc: ECCErrorsL1CacheDesc,
+ IntNumeratorVal: 100,
+ },
+ ECCErrorsL2CacheAttr: {
+ Unit: ECCErrorsL2CacheUnit,
+ Desc: ECCErrorsL2CacheDesc,
+ IntNumeratorVal: 100,
+ },
+ ECCErrorsDeviceAttr: {
+ Unit: ECCErrorsDeviceUnit,
+ Desc: ECCErrorsDeviceDesc,
+ IntNumeratorVal: 100,
+ },
+ },
+ },
+ Timestamp: time.Date(1974, time.May, 19, 1, 2, 3, 4, time.UTC),
+ },
+ },
+ {
+ Name: "EncoderUtilization is nil",
+ Timestamp: time.Date(1974, time.May, 19, 1, 2, 3, 4, time.UTC),
+ ItemStat: &nvml.StatsData{
+ DeviceData: &nvml.DeviceData{
+ UUID: "UUID1",
+ DeviceName: helper.StringToPtr("DeviceName1"),
+ MemoryMiB: helper.Uint64ToPtr(1),
+ PowerW: helper.UintToPtr(1),
+ BAR1MiB: helper.Uint64ToPtr(256),
+ },
+ PowerUsageW: helper.UintToPtr(1),
+ GPUUtilization: helper.UintToPtr(1),
+ MemoryUtilization: helper.UintToPtr(1),
+ EncoderUtilization: nil,
+ DecoderUtilization: helper.UintToPtr(1),
+ TemperatureC: helper.UintToPtr(1),
+ UsedMemoryMiB: helper.Uint64ToPtr(1),
+ BAR1UsedMiB: helper.Uint64ToPtr(1),
+ ECCErrorsL1Cache: helper.Uint64ToPtr(100),
+ ECCErrorsL2Cache: helper.Uint64ToPtr(100),
+ ECCErrorsDevice: helper.Uint64ToPtr(100),
+ },
+ ExpectedResult: &device.DeviceStats{
+ Summary: &device.StatValue{
+ Unit: TemperatureUnit,
+ Desc: TemperatureDesc,
+ IntNumeratorVal: 1,
+ },
+ Stats: &device.StatObject{
+ Attributes: map[string]*device.StatValue{
+ PowerUsageAttr: {
+ Unit: PowerUsageUnit,
+ Desc: PowerUsageDesc,
+ IntNumeratorVal: 1,
+ IntDenominatorVal: 1,
+ },
+ GPUUtilizationAttr: {
+ Unit: GPUUtilizationUnit,
+ Desc: GPUUtilizationDesc,
+ IntNumeratorVal: 1,
+ },
+ MemoryUtilizationAttr: {
+ Unit: MemoryUtilizationUnit,
+ Desc: MemoryUtilizationDesc,
+ IntNumeratorVal: 1,
+ },
+ EncoderUtilizationAttr: {
+ Unit: EncoderUtilizationUnit,
+ Desc: EncoderUtilizationDesc,
+ StringVal: notAvailable,
+ },
+ DecoderUtilizationAttr: {
+ Unit: DecoderUtilizationUnit,
+ Desc: DecoderUtilizationDesc,
+ IntNumeratorVal: 1,
+ },
+ TemperatureAttr: {
+ Unit: TemperatureUnit,
+ Desc: TemperatureDesc,
+ IntNumeratorVal: 1,
+ },
+ MemoryStateAttr: {
+ Unit: MemoryStateUnit,
+ Desc: MemoryStateDesc,
+ IntNumeratorVal: 1,
+ IntDenominatorVal: 1,
+ },
+ BAR1StateAttr: {
+ Unit: BAR1StateUnit,
+ Desc: BAR1StateDesc,
+ IntNumeratorVal: 1,
+ IntDenominatorVal: 256,
+ },
+ ECCErrorsL1CacheAttr: {
+ Unit: ECCErrorsL1CacheUnit,
+ Desc: ECCErrorsL1CacheDesc,
+ IntNumeratorVal: 100,
+ },
+ ECCErrorsL2CacheAttr: {
+ Unit: ECCErrorsL2CacheUnit,
+ Desc: ECCErrorsL2CacheDesc,
+ IntNumeratorVal: 100,
+ },
+ ECCErrorsDeviceAttr: {
+ Unit: ECCErrorsDeviceUnit,
+ Desc: ECCErrorsDeviceDesc,
+ IntNumeratorVal: 100,
+ },
+ },
+ },
+ Timestamp: time.Date(1974, time.May, 19, 1, 2, 3, 4, time.UTC),
+ },
+ },
+ {
+ Name: "DecoderUtilization is nil",
+ Timestamp: time.Date(1974, time.May, 19, 1, 2, 3, 4, time.UTC),
+ ItemStat: &nvml.StatsData{
+ DeviceData: &nvml.DeviceData{
+ UUID: "UUID1",
+ DeviceName: helper.StringToPtr("DeviceName1"),
+ MemoryMiB: helper.Uint64ToPtr(1),
+ PowerW: helper.UintToPtr(1),
+ BAR1MiB: helper.Uint64ToPtr(256),
+ },
+ PowerUsageW: helper.UintToPtr(1),
+ GPUUtilization: helper.UintToPtr(1),
+ MemoryUtilization: helper.UintToPtr(1),
+ EncoderUtilization: helper.UintToPtr(1),
+ DecoderUtilization: nil,
+ TemperatureC: helper.UintToPtr(1),
+ UsedMemoryMiB: helper.Uint64ToPtr(1),
+ BAR1UsedMiB: helper.Uint64ToPtr(1),
+ ECCErrorsL1Cache: helper.Uint64ToPtr(100),
+ ECCErrorsL2Cache: helper.Uint64ToPtr(100),
+ ECCErrorsDevice: helper.Uint64ToPtr(100),
+ },
+ ExpectedResult: &device.DeviceStats{
+ Summary: &device.StatValue{
+ Unit: TemperatureUnit,
+ Desc: TemperatureDesc,
+ IntNumeratorVal: 1,
+ },
+ Stats: &device.StatObject{
+ Attributes: map[string]*device.StatValue{
+ PowerUsageAttr: {
+ Unit: PowerUsageUnit,
+ Desc: PowerUsageDesc,
+ IntNumeratorVal: 1,
+ IntDenominatorVal: 1,
+ },
+ GPUUtilizationAttr: {
+ Unit: GPUUtilizationUnit,
+ Desc: GPUUtilizationDesc,
+ IntNumeratorVal: 1,
+ },
+ MemoryUtilizationAttr: {
+ Unit: MemoryUtilizationUnit,
+ Desc: MemoryUtilizationDesc,
+ IntNumeratorVal: 1,
+ },
+ EncoderUtilizationAttr: {
+ Unit: EncoderUtilizationUnit,
+ Desc: EncoderUtilizationDesc,
+ IntNumeratorVal: 1,
+ },
+ DecoderUtilizationAttr: {
+ Unit: DecoderUtilizationUnit,
+ Desc: DecoderUtilizationDesc,
+ StringVal: notAvailable,
+ },
+ TemperatureAttr: {
+ Unit: TemperatureUnit,
+ Desc: TemperatureDesc,
+ IntNumeratorVal: 1,
+ },
+ MemoryStateAttr: {
+ Unit: MemoryStateUnit,
+ Desc: MemoryStateDesc,
+ IntNumeratorVal: 1,
+ IntDenominatorVal: 1,
+ },
+ BAR1StateAttr: {
+ Unit: BAR1StateUnit,
+ Desc: BAR1StateDesc,
+ IntNumeratorVal: 1,
+ IntDenominatorVal: 256,
+ },
+ ECCErrorsL1CacheAttr: {
+ Unit: ECCErrorsL1CacheUnit,
+ Desc: ECCErrorsL1CacheDesc,
+ IntNumeratorVal: 100,
+ },
+ ECCErrorsL2CacheAttr: {
+ Unit: ECCErrorsL2CacheUnit,
+ Desc: ECCErrorsL2CacheDesc,
+ IntNumeratorVal: 100,
+ },
+ ECCErrorsDeviceAttr: {
+ Unit: ECCErrorsDeviceUnit,
+ Desc: ECCErrorsDeviceDesc,
+ IntNumeratorVal: 100,
+ },
+ },
+ },
+ Timestamp: time.Date(1974, time.May, 19, 1, 2, 3, 4, time.UTC),
+ },
+ },
+ {
+ Name: "Temperature is nil",
+ Timestamp: time.Date(1974, time.May, 19, 1, 2, 3, 4, time.UTC),
+ ItemStat: &nvml.StatsData{
+ DeviceData: &nvml.DeviceData{
+ UUID: "UUID1",
+ DeviceName: helper.StringToPtr("DeviceName1"),
+ MemoryMiB: helper.Uint64ToPtr(1),
+ PowerW: helper.UintToPtr(1),
+ BAR1MiB: helper.Uint64ToPtr(256),
+ },
+ PowerUsageW: helper.UintToPtr(1),
+ GPUUtilization: helper.UintToPtr(1),
+ MemoryUtilization: helper.UintToPtr(1),
+ EncoderUtilization: helper.UintToPtr(1),
+ DecoderUtilization: helper.UintToPtr(1),
+ TemperatureC: nil,
+ UsedMemoryMiB: helper.Uint64ToPtr(1),
+ BAR1UsedMiB: helper.Uint64ToPtr(1),
+ ECCErrorsL1Cache: helper.Uint64ToPtr(100),
+ ECCErrorsL2Cache: helper.Uint64ToPtr(100),
+ ECCErrorsDevice: helper.Uint64ToPtr(100),
+ },
+ ExpectedResult: &device.DeviceStats{
+ Summary: &device.StatValue{
+ Unit: TemperatureUnit,
+ Desc: TemperatureDesc,
+ StringVal: notAvailable,
+ },
+ Stats: &device.StatObject{
+ Attributes: map[string]*device.StatValue{
+ PowerUsageAttr: {
+ Unit: PowerUsageUnit,
+ Desc: PowerUsageDesc,
+ IntNumeratorVal: 1,
+ IntDenominatorVal: 1,
+ },
+ GPUUtilizationAttr: {
+ Unit: GPUUtilizationUnit,
+ Desc: GPUUtilizationDesc,
+ IntNumeratorVal: 1,
+ },
+ MemoryUtilizationAttr: {
+ Unit: MemoryUtilizationUnit,
+ Desc: MemoryUtilizationDesc,
+ IntNumeratorVal: 1,
+ },
+ EncoderUtilizationAttr: {
+ Unit: EncoderUtilizationUnit,
+ Desc: EncoderUtilizationDesc,
+ IntNumeratorVal: 1,
+ },
+ DecoderUtilizationAttr: {
+ Unit: DecoderUtilizationUnit,
+ Desc: DecoderUtilizationDesc,
+ IntNumeratorVal: 1,
+ },
+ TemperatureAttr: {
+ Unit: TemperatureUnit,
+ Desc: TemperatureDesc,
+ StringVal: notAvailable,
+ },
+ MemoryStateAttr: {
+ Unit: MemoryStateUnit,
+ Desc: MemoryStateDesc,
+ IntNumeratorVal: 1,
+ IntDenominatorVal: 1,
+ },
+ BAR1StateAttr: {
+ Unit: BAR1StateUnit,
+ Desc: BAR1StateDesc,
+ IntNumeratorVal: 1,
+ IntDenominatorVal: 256,
+ },
+ ECCErrorsL1CacheAttr: {
+ Unit: ECCErrorsL1CacheUnit,
+ Desc: ECCErrorsL1CacheDesc,
+ IntNumeratorVal: 100,
+ },
+ ECCErrorsL2CacheAttr: {
+ Unit: ECCErrorsL2CacheUnit,
+ Desc: ECCErrorsL2CacheDesc,
+ IntNumeratorVal: 100,
+ },
+ ECCErrorsDeviceAttr: {
+ Unit: ECCErrorsDeviceUnit,
+ Desc: ECCErrorsDeviceDesc,
+ IntNumeratorVal: 100,
+ },
+ },
+ },
+ Timestamp: time.Date(1974, time.May, 19, 1, 2, 3, 4, time.UTC),
+ },
+ },
+ {
+ Name: "UsedMemoryMiB is nil",
+ Timestamp: time.Date(1974, time.May, 19, 1, 2, 3, 4, time.UTC),
+ ItemStat: &nvml.StatsData{
+ DeviceData: &nvml.DeviceData{
+ UUID: "UUID1",
+ DeviceName: helper.StringToPtr("DeviceName1"),
+ MemoryMiB: helper.Uint64ToPtr(1),
+ PowerW: helper.UintToPtr(1),
+ BAR1MiB: helper.Uint64ToPtr(256),
+ },
+ PowerUsageW: helper.UintToPtr(1),
+ GPUUtilization: helper.UintToPtr(1),
+ MemoryUtilization: helper.UintToPtr(1),
+ EncoderUtilization: helper.UintToPtr(1),
+ DecoderUtilization: helper.UintToPtr(1),
+ TemperatureC: helper.UintToPtr(1),
+ UsedMemoryMiB: nil,
+ BAR1UsedMiB: helper.Uint64ToPtr(1),
+ ECCErrorsL1Cache: helper.Uint64ToPtr(100),
+ ECCErrorsL2Cache: helper.Uint64ToPtr(100),
+ ECCErrorsDevice: helper.Uint64ToPtr(100),
+ },
+ ExpectedResult: &device.DeviceStats{
+ Summary: &device.StatValue{
+ Unit: TemperatureUnit,
+ Desc: TemperatureDesc,
+ IntNumeratorVal: 1,
+ },
+ Stats: &device.StatObject{
+ Attributes: map[string]*device.StatValue{
+ PowerUsageAttr: {
+ Unit: PowerUsageUnit,
+ Desc: PowerUsageDesc,
+ IntNumeratorVal: 1,
+ IntDenominatorVal: 1,
+ },
+ GPUUtilizationAttr: {
+ Unit: GPUUtilizationUnit,
+ Desc: GPUUtilizationDesc,
+ IntNumeratorVal: 1,
+ },
+ MemoryUtilizationAttr: {
+ Unit: MemoryUtilizationUnit,
+ Desc: MemoryUtilizationDesc,
+ IntNumeratorVal: 1,
+ },
+ EncoderUtilizationAttr: {
+ Unit: EncoderUtilizationUnit,
+ Desc: EncoderUtilizationDesc,
+ IntNumeratorVal: 1,
+ },
+ DecoderUtilizationAttr: {
+ Unit: DecoderUtilizationUnit,
+ Desc: DecoderUtilizationDesc,
+ IntNumeratorVal: 1,
+ },
+ TemperatureAttr: {
+ Unit: TemperatureUnit,
+ Desc: TemperatureDesc,
+ IntNumeratorVal: 1,
+ },
+ MemoryStateAttr: {
+ Unit: MemoryStateUnit,
+ Desc: MemoryStateDesc,
+ StringVal: notAvailable,
+ },
+ BAR1StateAttr: {
+ Unit: BAR1StateUnit,
+ Desc: BAR1StateDesc,
+ IntNumeratorVal: 1,
+ IntDenominatorVal: 256,
+ },
+ ECCErrorsL1CacheAttr: {
+ Unit: ECCErrorsL1CacheUnit,
+ Desc: ECCErrorsL1CacheDesc,
+ IntNumeratorVal: 100,
+ },
+ ECCErrorsL2CacheAttr: {
+ Unit: ECCErrorsL2CacheUnit,
+ Desc: ECCErrorsL2CacheDesc,
+ IntNumeratorVal: 100,
+ },
+ ECCErrorsDeviceAttr: {
+ Unit: ECCErrorsDeviceUnit,
+ Desc: ECCErrorsDeviceDesc,
+ IntNumeratorVal: 100,
+ },
+ },
+ },
+ Timestamp: time.Date(1974, time.May, 19, 1, 2, 3, 4, time.UTC),
+ },
+ },
+ {
+ Name: "MemoryMiB is nil",
+ Timestamp: time.Date(1974, time.May, 19, 1, 2, 3, 4, time.UTC),
+ ItemStat: &nvml.StatsData{
+ DeviceData: &nvml.DeviceData{
+ UUID: "UUID1",
+ DeviceName: helper.StringToPtr("DeviceName1"),
+ MemoryMiB: nil,
+ PowerW: helper.UintToPtr(1),
+ BAR1MiB: helper.Uint64ToPtr(256),
+ },
+ PowerUsageW: helper.UintToPtr(1),
+ GPUUtilization: helper.UintToPtr(1),
+ MemoryUtilization: helper.UintToPtr(1),
+ EncoderUtilization: helper.UintToPtr(1),
+ DecoderUtilization: helper.UintToPtr(1),
+ TemperatureC: helper.UintToPtr(1),
+ UsedMemoryMiB: helper.Uint64ToPtr(1),
+ BAR1UsedMiB: helper.Uint64ToPtr(1),
+ ECCErrorsL1Cache: helper.Uint64ToPtr(100),
+ ECCErrorsL2Cache: helper.Uint64ToPtr(100),
+ ECCErrorsDevice: helper.Uint64ToPtr(100),
+ },
+ ExpectedResult: &device.DeviceStats{
+ Summary: &device.StatValue{
+ Unit: TemperatureUnit,
+ Desc: TemperatureDesc,
+ IntNumeratorVal: 1,
+ },
+ Stats: &device.StatObject{
+ Attributes: map[string]*device.StatValue{
+ PowerUsageAttr: {
+ Unit: PowerUsageUnit,
+ Desc: PowerUsageDesc,
+ IntNumeratorVal: 1,
+ IntDenominatorVal: 1,
+ },
+ GPUUtilizationAttr: {
+ Unit: GPUUtilizationUnit,
+ Desc: GPUUtilizationDesc,
+ IntNumeratorVal: 1,
+ },
+ MemoryUtilizationAttr: {
+ Unit: MemoryUtilizationUnit,
+ Desc: MemoryUtilizationDesc,
+ IntNumeratorVal: 1,
+ },
+ EncoderUtilizationAttr: {
+ Unit: EncoderUtilizationUnit,
+ Desc: EncoderUtilizationDesc,
+ IntNumeratorVal: 1,
+ },
+ DecoderUtilizationAttr: {
+ Unit: DecoderUtilizationUnit,
+ Desc: DecoderUtilizationDesc,
+ IntNumeratorVal: 1,
+ },
+ TemperatureAttr: {
+ Unit: TemperatureUnit,
+ Desc: TemperatureDesc,
+ IntNumeratorVal: 1,
+ },
+ MemoryStateAttr: {
+ Unit: MemoryStateUnit,
+ Desc: MemoryStateDesc,
+ StringVal: notAvailable,
+ },
+ BAR1StateAttr: {
+ Unit: BAR1StateUnit,
+ Desc: BAR1StateDesc,
+ IntNumeratorVal: 1,
+ IntDenominatorVal: 256,
+ },
+ ECCErrorsL1CacheAttr: {
+ Unit: ECCErrorsL1CacheUnit,
+ Desc: ECCErrorsL1CacheDesc,
+ IntNumeratorVal: 100,
+ },
+ ECCErrorsL2CacheAttr: {
+ Unit: ECCErrorsL2CacheUnit,
+ Desc: ECCErrorsL2CacheDesc,
+ IntNumeratorVal: 100,
+ },
+ ECCErrorsDeviceAttr: {
+ Unit: ECCErrorsDeviceUnit,
+ Desc: ECCErrorsDeviceDesc,
+ IntNumeratorVal: 100,
+ },
+ },
+ },
+ Timestamp: time.Date(1974, time.May, 19, 1, 2, 3, 4, time.UTC),
+ },
+ },
+ {
+ Name: "BAR1UsedMiB is nil",
+ Timestamp: time.Date(1974, time.May, 19, 1, 2, 3, 4, time.UTC),
+ ItemStat: &nvml.StatsData{
+ DeviceData: &nvml.DeviceData{
+ UUID: "UUID1",
+ DeviceName: helper.StringToPtr("DeviceName1"),
+ MemoryMiB: helper.Uint64ToPtr(1),
+ PowerW: helper.UintToPtr(1),
+ BAR1MiB: helper.Uint64ToPtr(256),
+ },
+ PowerUsageW: helper.UintToPtr(1),
+ GPUUtilization: helper.UintToPtr(1),
+ MemoryUtilization: helper.UintToPtr(1),
+ EncoderUtilization: helper.UintToPtr(1),
+ DecoderUtilization: helper.UintToPtr(1),
+ TemperatureC: helper.UintToPtr(1),
+ UsedMemoryMiB: helper.Uint64ToPtr(1),
+ BAR1UsedMiB: nil,
+ ECCErrorsL1Cache: helper.Uint64ToPtr(100),
+ ECCErrorsL2Cache: helper.Uint64ToPtr(100),
+ ECCErrorsDevice: helper.Uint64ToPtr(100),
+ },
+ ExpectedResult: &device.DeviceStats{
+ Summary: &device.StatValue{
+ Unit: TemperatureUnit,
+ Desc: TemperatureDesc,
+ IntNumeratorVal: 1,
+ },
+ Stats: &device.StatObject{
+ Attributes: map[string]*device.StatValue{
+ PowerUsageAttr: {
+ Unit: PowerUsageUnit,
+ Desc: PowerUsageDesc,
+ IntNumeratorVal: 1,
+ IntDenominatorVal: 1,
+ },
+ GPUUtilizationAttr: {
+ Unit: GPUUtilizationUnit,
+ Desc: GPUUtilizationDesc,
+ IntNumeratorVal: 1,
+ },
+ MemoryUtilizationAttr: {
+ Unit: MemoryUtilizationUnit,
+ Desc: MemoryUtilizationDesc,
+ IntNumeratorVal: 1,
+ },
+ EncoderUtilizationAttr: {
+ Unit: EncoderUtilizationUnit,
+ Desc: EncoderUtilizationDesc,
+ IntNumeratorVal: 1,
+ },
+ DecoderUtilizationAttr: {
+ Unit: DecoderUtilizationUnit,
+ Desc: DecoderUtilizationDesc,
+ IntNumeratorVal: 1,
+ },
+ TemperatureAttr: {
+ Unit: TemperatureUnit,
+ Desc: TemperatureDesc,
+ IntNumeratorVal: 1,
+ },
+ MemoryStateAttr: {
+ Unit: MemoryStateUnit,
+ Desc: MemoryStateDesc,
+ IntNumeratorVal: 1,
+ IntDenominatorVal: 1,
+ },
+ BAR1StateAttr: {
+ Unit: BAR1StateUnit,
+ Desc: BAR1StateDesc,
+ StringVal: notAvailable,
+ },
+ ECCErrorsL1CacheAttr: {
+ Unit: ECCErrorsL1CacheUnit,
+ Desc: ECCErrorsL1CacheDesc,
+ IntNumeratorVal: 100,
+ },
+ ECCErrorsL2CacheAttr: {
+ Unit: ECCErrorsL2CacheUnit,
+ Desc: ECCErrorsL2CacheDesc,
+ IntNumeratorVal: 100,
+ },
+ ECCErrorsDeviceAttr: {
+ Unit: ECCErrorsDeviceUnit,
+ Desc: ECCErrorsDeviceDesc,
+ IntNumeratorVal: 100,
+ },
+ },
+ },
+ Timestamp: time.Date(1974, time.May, 19, 1, 2, 3, 4, time.UTC),
+ },
+ },
+ {
+ Name: "BAR1MiB is nil",
+ Timestamp: time.Date(1974, time.May, 19, 1, 2, 3, 4, time.UTC),
+ ItemStat: &nvml.StatsData{
+ DeviceData: &nvml.DeviceData{
+ UUID: "UUID1",
+ DeviceName: helper.StringToPtr("DeviceName1"),
+ MemoryMiB: helper.Uint64ToPtr(1),
+ PowerW: helper.UintToPtr(1),
+ BAR1MiB: nil,
+ },
+ PowerUsageW: helper.UintToPtr(1),
+ GPUUtilization: helper.UintToPtr(1),
+ MemoryUtilization: helper.UintToPtr(1),
+ EncoderUtilization: helper.UintToPtr(1),
+ DecoderUtilization: helper.UintToPtr(1),
+ TemperatureC: helper.UintToPtr(1),
+ UsedMemoryMiB: helper.Uint64ToPtr(1),
+ BAR1UsedMiB: helper.Uint64ToPtr(1),
+ ECCErrorsL1Cache: helper.Uint64ToPtr(100),
+ ECCErrorsL2Cache: helper.Uint64ToPtr(100),
+ ECCErrorsDevice: helper.Uint64ToPtr(100),
+ },
+ ExpectedResult: &device.DeviceStats{
+ Summary: &device.StatValue{
+ Unit: TemperatureUnit,
+ Desc: TemperatureDesc,
+ IntNumeratorVal: 1,
+ },
+ Stats: &device.StatObject{
+ Attributes: map[string]*device.StatValue{
+ PowerUsageAttr: {
+ Unit: PowerUsageUnit,
+ Desc: PowerUsageDesc,
+ IntNumeratorVal: 1,
+ IntDenominatorVal: 1,
+ },
+ GPUUtilizationAttr: {
+ Unit: GPUUtilizationUnit,
+ Desc: GPUUtilizationDesc,
+ IntNumeratorVal: 1,
+ },
+ MemoryUtilizationAttr: {
+ Unit: MemoryUtilizationUnit,
+ Desc: MemoryUtilizationDesc,
+ IntNumeratorVal: 1,
+ },
+ EncoderUtilizationAttr: {
+ Unit: EncoderUtilizationUnit,
+ Desc: EncoderUtilizationDesc,
+ IntNumeratorVal: 1,
+ },
+ DecoderUtilizationAttr: {
+ Unit: DecoderUtilizationUnit,
+ Desc: DecoderUtilizationDesc,
+ IntNumeratorVal: 1,
+ },
+ TemperatureAttr: {
+ Unit: TemperatureUnit,
+ Desc: TemperatureDesc,
+ IntNumeratorVal: 1,
+ },
+ MemoryStateAttr: {
+ Unit: MemoryStateUnit,
+ Desc: MemoryStateDesc,
+ IntNumeratorVal: 1,
+ IntDenominatorVal: 1,
+ },
+ BAR1StateAttr: {
+ Unit: BAR1StateUnit,
+ Desc: BAR1StateDesc,
+ StringVal: notAvailable,
+ },
+ ECCErrorsL1CacheAttr: {
+ Unit: ECCErrorsL1CacheUnit,
+ Desc: ECCErrorsL1CacheDesc,
+ IntNumeratorVal: 100,
+ },
+ ECCErrorsL2CacheAttr: {
+ Unit: ECCErrorsL2CacheUnit,
+ Desc: ECCErrorsL2CacheDesc,
+ IntNumeratorVal: 100,
+ },
+ ECCErrorsDeviceAttr: {
+ Unit: ECCErrorsDeviceUnit,
+ Desc: ECCErrorsDeviceDesc,
+ IntNumeratorVal: 100,
+ },
+ },
+ },
+ Timestamp: time.Date(1974, time.May, 19, 1, 2, 3, 4, time.UTC),
+ },
+ },
+ {
+ Name: "ECCErrorsL1Cache is nil",
+ Timestamp: time.Date(1974, time.May, 19, 1, 2, 3, 4, time.UTC),
+ ItemStat: &nvml.StatsData{
+ DeviceData: &nvml.DeviceData{
+ UUID: "UUID1",
+ DeviceName: helper.StringToPtr("DeviceName1"),
+ MemoryMiB: helper.Uint64ToPtr(1),
+ PowerW: helper.UintToPtr(1),
+ BAR1MiB: helper.Uint64ToPtr(256),
+ },
+ PowerUsageW: helper.UintToPtr(1),
+ GPUUtilization: helper.UintToPtr(1),
+ MemoryUtilization: helper.UintToPtr(1),
+ EncoderUtilization: helper.UintToPtr(1),
+ DecoderUtilization: helper.UintToPtr(1),
+ TemperatureC: helper.UintToPtr(1),
+ UsedMemoryMiB: helper.Uint64ToPtr(1),
+ BAR1UsedMiB: helper.Uint64ToPtr(1),
+ ECCErrorsL1Cache: nil,
+ ECCErrorsL2Cache: helper.Uint64ToPtr(100),
+ ECCErrorsDevice: helper.Uint64ToPtr(100),
+ },
+ ExpectedResult: &device.DeviceStats{
+ Summary: &device.StatValue{
+ Unit: TemperatureUnit,
+ Desc: TemperatureDesc,
+ IntNumeratorVal: 1,
+ },
+ Stats: &device.StatObject{
+ Attributes: map[string]*device.StatValue{
+ PowerUsageAttr: {
+ Unit: PowerUsageUnit,
+ Desc: PowerUsageDesc,
+ IntNumeratorVal: 1,
+ IntDenominatorVal: 1,
+ },
+ GPUUtilizationAttr: {
+ Unit: GPUUtilizationUnit,
+ Desc: GPUUtilizationDesc,
+ IntNumeratorVal: 1,
+ },
+ MemoryUtilizationAttr: {
+ Unit: MemoryUtilizationUnit,
+ Desc: MemoryUtilizationDesc,
+ IntNumeratorVal: 1,
+ },
+ EncoderUtilizationAttr: {
+ Unit: EncoderUtilizationUnit,
+ Desc: EncoderUtilizationDesc,
+ IntNumeratorVal: 1,
+ },
+ DecoderUtilizationAttr: {
+ Unit: DecoderUtilizationUnit,
+ Desc: DecoderUtilizationDesc,
+ IntNumeratorVal: 1,
+ },
+ TemperatureAttr: {
+ Unit: TemperatureUnit,
+ Desc: TemperatureDesc,
+ IntNumeratorVal: 1,
+ },
+ MemoryStateAttr: {
+ Unit: MemoryStateUnit,
+ Desc: MemoryStateDesc,
+ IntNumeratorVal: 1,
+ IntDenominatorVal: 1,
+ },
+ BAR1StateAttr: {
+ Unit: BAR1StateUnit,
+ Desc: BAR1StateDesc,
+ IntNumeratorVal: 1,
+ IntDenominatorVal: 256,
+ },
+ ECCErrorsL1CacheAttr: {
+ Unit: ECCErrorsL1CacheUnit,
+ Desc: ECCErrorsL1CacheDesc,
+ StringVal: notAvailable,
+ },
+ ECCErrorsL2CacheAttr: {
+ Unit: ECCErrorsL2CacheUnit,
+ Desc: ECCErrorsL2CacheDesc,
+ IntNumeratorVal: 100,
+ },
+ ECCErrorsDeviceAttr: {
+ Unit: ECCErrorsDeviceUnit,
+ Desc: ECCErrorsDeviceDesc,
+ IntNumeratorVal: 100,
+ },
+ },
+ },
+ Timestamp: time.Date(1974, time.May, 19, 1, 2, 3, 4, time.UTC),
+ },
+ },
+ {
+ Name: "ECCErrorsL2Cache is nil",
+ Timestamp: time.Date(1974, time.May, 19, 1, 2, 3, 4, time.UTC),
+ ItemStat: &nvml.StatsData{
+ DeviceData: &nvml.DeviceData{
+ UUID: "UUID1",
+ DeviceName: helper.StringToPtr("DeviceName1"),
+ MemoryMiB: helper.Uint64ToPtr(1),
+ PowerW: helper.UintToPtr(1),
+ BAR1MiB: helper.Uint64ToPtr(256),
+ },
+ PowerUsageW: helper.UintToPtr(1),
+ GPUUtilization: helper.UintToPtr(1),
+ MemoryUtilization: helper.UintToPtr(1),
+ EncoderUtilization: helper.UintToPtr(1),
+ DecoderUtilization: helper.UintToPtr(1),
+ TemperatureC: helper.UintToPtr(1),
+ UsedMemoryMiB: helper.Uint64ToPtr(1),
+ BAR1UsedMiB: helper.Uint64ToPtr(1),
+ ECCErrorsL1Cache: helper.Uint64ToPtr(100),
+ ECCErrorsL2Cache: nil,
+ ECCErrorsDevice: helper.Uint64ToPtr(100),
+ },
+ ExpectedResult: &device.DeviceStats{
+ Summary: &device.StatValue{
+ Unit: TemperatureUnit,
+ Desc: TemperatureDesc,
+ IntNumeratorVal: 1,
+ },
+ Stats: &device.StatObject{
+ Attributes: map[string]*device.StatValue{
+ PowerUsageAttr: {
+ Unit: PowerUsageUnit,
+ Desc: PowerUsageDesc,
+ IntNumeratorVal: 1,
+ IntDenominatorVal: 1,
+ },
+ GPUUtilizationAttr: {
+ Unit: GPUUtilizationUnit,
+ Desc: GPUUtilizationDesc,
+ IntNumeratorVal: 1,
+ },
+ MemoryUtilizationAttr: {
+ Unit: MemoryUtilizationUnit,
+ Desc: MemoryUtilizationDesc,
+ IntNumeratorVal: 1,
+ },
+ EncoderUtilizationAttr: {
+ Unit: EncoderUtilizationUnit,
+ Desc: EncoderUtilizationDesc,
+ IntNumeratorVal: 1,
+ },
+ DecoderUtilizationAttr: {
+ Unit: DecoderUtilizationUnit,
+ Desc: DecoderUtilizationDesc,
+ IntNumeratorVal: 1,
+ },
+ TemperatureAttr: {
+ Unit: TemperatureUnit,
+ Desc: TemperatureDesc,
+ IntNumeratorVal: 1,
+ },
+ MemoryStateAttr: {
+ Unit: MemoryStateUnit,
+ Desc: MemoryStateDesc,
+ IntNumeratorVal: 1,
+ IntDenominatorVal: 1,
+ },
+ BAR1StateAttr: {
+ Unit: BAR1StateUnit,
+ Desc: BAR1StateDesc,
+ IntNumeratorVal: 1,
+ IntDenominatorVal: 256,
+ },
+ ECCErrorsL1CacheAttr: {
+ Unit: ECCErrorsL1CacheUnit,
+ Desc: ECCErrorsL1CacheDesc,
+ IntNumeratorVal: 100,
+ },
+ ECCErrorsL2CacheAttr: {
+ Unit: ECCErrorsL2CacheUnit,
+ Desc: ECCErrorsL2CacheDesc,
+ StringVal: notAvailable,
+ },
+ ECCErrorsDeviceAttr: {
+ Unit: ECCErrorsDeviceUnit,
+ Desc: ECCErrorsDeviceDesc,
+ IntNumeratorVal: 100,
+ },
+ },
+ },
+ Timestamp: time.Date(1974, time.May, 19, 1, 2, 3, 4, time.UTC),
+ },
+ },
+ {
+ Name: "ECCErrorsDevice is nil",
+ Timestamp: time.Date(1974, time.May, 19, 1, 2, 3, 4, time.UTC),
+ ItemStat: &nvml.StatsData{
+ DeviceData: &nvml.DeviceData{
+ UUID: "UUID1",
+ DeviceName: helper.StringToPtr("DeviceName1"),
+ MemoryMiB: helper.Uint64ToPtr(1),
+ PowerW: helper.UintToPtr(1),
+ BAR1MiB: helper.Uint64ToPtr(256),
+ },
+ PowerUsageW: helper.UintToPtr(1),
+ GPUUtilization: helper.UintToPtr(1),
+ MemoryUtilization: helper.UintToPtr(1),
+ EncoderUtilization: helper.UintToPtr(1),
+ DecoderUtilization: helper.UintToPtr(1),
+ TemperatureC: helper.UintToPtr(1),
+ UsedMemoryMiB: helper.Uint64ToPtr(1),
+ BAR1UsedMiB: helper.Uint64ToPtr(1),
+ ECCErrorsL1Cache: helper.Uint64ToPtr(100),
+ ECCErrorsL2Cache: helper.Uint64ToPtr(100),
+ ECCErrorsDevice: nil,
+ },
+ ExpectedResult: &device.DeviceStats{
+ Summary: &device.StatValue{
+ Unit: TemperatureUnit,
+ Desc: TemperatureDesc,
+ IntNumeratorVal: 1,
+ },
+ Stats: &device.StatObject{
+ Attributes: map[string]*device.StatValue{
+ PowerUsageAttr: {
+ Unit: PowerUsageUnit,
+ Desc: PowerUsageDesc,
+ IntNumeratorVal: 1,
+ IntDenominatorVal: 1,
+ },
+ GPUUtilizationAttr: {
+ Unit: GPUUtilizationUnit,
+ Desc: GPUUtilizationDesc,
+ IntNumeratorVal: 1,
+ },
+ MemoryUtilizationAttr: {
+ Unit: MemoryUtilizationUnit,
+ Desc: MemoryUtilizationDesc,
+ IntNumeratorVal: 1,
+ },
+ EncoderUtilizationAttr: {
+ Unit: EncoderUtilizationUnit,
+ Desc: EncoderUtilizationDesc,
+ IntNumeratorVal: 1,
+ },
+ DecoderUtilizationAttr: {
+ Unit: DecoderUtilizationUnit,
+ Desc: DecoderUtilizationDesc,
+ IntNumeratorVal: 1,
+ },
+ TemperatureAttr: {
+ Unit: TemperatureUnit,
+ Desc: TemperatureDesc,
+ IntNumeratorVal: 1,
+ },
+ MemoryStateAttr: {
+ Unit: MemoryStateUnit,
+ Desc: MemoryStateDesc,
+ IntNumeratorVal: 1,
+ IntDenominatorVal: 1,
+ },
+ BAR1StateAttr: {
+ Unit: BAR1StateUnit,
+ Desc: BAR1StateDesc,
+ IntNumeratorVal: 1,
+ IntDenominatorVal: 256,
+ },
+ ECCErrorsL1CacheAttr: {
+ Unit: ECCErrorsL1CacheUnit,
+ Desc: ECCErrorsL1CacheDesc,
+ IntNumeratorVal: 100,
+ },
+ ECCErrorsL2CacheAttr: {
+ Unit: ECCErrorsL2CacheUnit,
+ Desc: ECCErrorsL2CacheDesc,
+ IntNumeratorVal: 100,
+ },
+ ECCErrorsDeviceAttr: {
+ Unit: ECCErrorsDeviceUnit,
+ Desc: ECCErrorsDeviceDesc,
+ StringVal: notAvailable,
+ },
+ },
+ },
+ Timestamp: time.Date(1974, time.May, 19, 1, 2, 3, 4, time.UTC),
+ },
+ },
+ } {
+ actualResult := statsForItem(testCase.ItemStat, testCase.Timestamp)
+ require.New(t).Equal(testCase.ExpectedResult, actualResult)
+ }
+}
+
+func TestStatsForGroup(t *testing.T) {
+ for _, testCase := range []struct {
+ Name string
+ Timestamp time.Time
+ GroupStats []*nvml.StatsData
+ GroupName string
+ ExpectedResult *device.DeviceGroupStats
+ }{
+ {
+ Name: "make sure that all data is transformed correctly",
+ Timestamp: time.Date(1974, time.May, 19, 1, 2, 3, 4, time.UTC),
+ GroupName: "DeviceName1",
+ GroupStats: []*nvml.StatsData{
+ {
+ DeviceData: &nvml.DeviceData{
+ UUID: "UUID1",
+ DeviceName: helper.StringToPtr("DeviceName1"),
+ MemoryMiB: helper.Uint64ToPtr(1),
+ PowerW: helper.UintToPtr(1),
+ BAR1MiB: helper.Uint64ToPtr(256),
+ },
+ PowerUsageW: helper.UintToPtr(1),
+ GPUUtilization: helper.UintToPtr(1),
+ MemoryUtilization: helper.UintToPtr(1),
+ EncoderUtilization: helper.UintToPtr(1),
+ DecoderUtilization: helper.UintToPtr(1),
+ TemperatureC: helper.UintToPtr(1),
+ UsedMemoryMiB: helper.Uint64ToPtr(1),
+ BAR1UsedMiB: helper.Uint64ToPtr(1),
+ ECCErrorsL1Cache: helper.Uint64ToPtr(100),
+ ECCErrorsL2Cache: helper.Uint64ToPtr(100),
+ ECCErrorsDevice: helper.Uint64ToPtr(100),
+ },
+ {
+ DeviceData: &nvml.DeviceData{
+ UUID: "UUID2",
+ DeviceName: helper.StringToPtr("DeviceName2"),
+ MemoryMiB: helper.Uint64ToPtr(2),
+ PowerW: helper.UintToPtr(2),
+ BAR1MiB: helper.Uint64ToPtr(256),
+ },
+ PowerUsageW: helper.UintToPtr(2),
+ GPUUtilization: helper.UintToPtr(2),
+ MemoryUtilization: helper.UintToPtr(2),
+ EncoderUtilization: helper.UintToPtr(2),
+ DecoderUtilization: helper.UintToPtr(2),
+ TemperatureC: helper.UintToPtr(2),
+ UsedMemoryMiB: helper.Uint64ToPtr(2),
+ BAR1UsedMiB: helper.Uint64ToPtr(2),
+ ECCErrorsL1Cache: helper.Uint64ToPtr(200),
+ ECCErrorsL2Cache: helper.Uint64ToPtr(200),
+ ECCErrorsDevice: helper.Uint64ToPtr(200),
+ },
+ {
+ DeviceData: &nvml.DeviceData{
+ UUID: "UUID3",
+ DeviceName: helper.StringToPtr("DeviceName3"),
+ MemoryMiB: helper.Uint64ToPtr(3),
+ PowerW: helper.UintToPtr(3),
+ BAR1MiB: helper.Uint64ToPtr(256),
+ },
+ PowerUsageW: helper.UintToPtr(3),
+ GPUUtilization: helper.UintToPtr(3),
+ MemoryUtilization: helper.UintToPtr(3),
+ EncoderUtilization: helper.UintToPtr(3),
+ DecoderUtilization: helper.UintToPtr(3),
+ TemperatureC: helper.UintToPtr(3),
+ UsedMemoryMiB: helper.Uint64ToPtr(3),
+ BAR1UsedMiB: helper.Uint64ToPtr(3),
+ ECCErrorsL1Cache: helper.Uint64ToPtr(300),
+ ECCErrorsL2Cache: helper.Uint64ToPtr(300),
+ ECCErrorsDevice: helper.Uint64ToPtr(300),
+ },
+ },
+ ExpectedResult: &device.DeviceGroupStats{
+ Vendor: vendor,
+ Type: deviceType,
+ Name: "DeviceName1",
+ InstanceStats: map[string]*device.DeviceStats{
+ "UUID1": {
+ Summary: &device.StatValue{
+ Unit: TemperatureUnit,
+ Desc: TemperatureDesc,
+ IntNumeratorVal: 1,
+ },
+ Stats: &device.StatObject{
+ Attributes: map[string]*device.StatValue{
+ PowerUsageAttr: {
+ Unit: PowerUsageUnit,
+ Desc: PowerUsageDesc,
+ IntNumeratorVal: 1,
+ IntDenominatorVal: 1,
+ },
+ GPUUtilizationAttr: {
+ Unit: GPUUtilizationUnit,
+ Desc: GPUUtilizationDesc,
+ IntNumeratorVal: 1,
+ },
+ MemoryUtilizationAttr: {
+ Unit: MemoryUtilizationUnit,
+ Desc: MemoryUtilizationDesc,
+ IntNumeratorVal: 1,
+ },
+ EncoderUtilizationAttr: {
+ Unit: EncoderUtilizationUnit,
+ Desc: EncoderUtilizationDesc,
+ IntNumeratorVal: 1,
+ },
+ DecoderUtilizationAttr: {
+ Unit: DecoderUtilizationUnit,
+ Desc: DecoderUtilizationDesc,
+ IntNumeratorVal: 1,
+ },
+ TemperatureAttr: {
+ Unit: TemperatureUnit,
+ Desc: TemperatureDesc,
+ IntNumeratorVal: 1,
+ },
+ MemoryStateAttr: {
+ Unit: MemoryStateUnit,
+ Desc: MemoryStateDesc,
+ IntNumeratorVal: 1,
+ IntDenominatorVal: 1,
+ },
+ BAR1StateAttr: {
+ Unit: BAR1StateUnit,
+ Desc: BAR1StateDesc,
+ IntNumeratorVal: 1,
+ IntDenominatorVal: 256,
+ },
+ ECCErrorsL1CacheAttr: {
+ Unit: ECCErrorsL1CacheUnit,
+ Desc: ECCErrorsL1CacheDesc,
+ IntNumeratorVal: 100,
+ },
+ ECCErrorsL2CacheAttr: {
+ Unit: ECCErrorsL2CacheUnit,
+ Desc: ECCErrorsL2CacheDesc,
+ IntNumeratorVal: 100,
+ },
+ ECCErrorsDeviceAttr: {
+ Unit: ECCErrorsDeviceUnit,
+ Desc: ECCErrorsDeviceDesc,
+ IntNumeratorVal: 100,
+ },
+ },
+ },
+ Timestamp: time.Date(1974, time.May, 19, 1, 2, 3, 4, time.UTC),
+ },
+ "UUID2": {
+ Summary: &device.StatValue{
+ Unit: TemperatureUnit,
+ Desc: TemperatureDesc,
+ IntNumeratorVal: 2,
+ },
+ Stats: &device.StatObject{
+ Attributes: map[string]*device.StatValue{
+ PowerUsageAttr: {
+ Unit: PowerUsageUnit,
+ Desc: PowerUsageDesc,
+ IntNumeratorVal: 2,
+ IntDenominatorVal: 2,
+ },
+ GPUUtilizationAttr: {
+ Unit: GPUUtilizationUnit,
+ Desc: GPUUtilizationDesc,
+ IntNumeratorVal: 2,
+ },
+ MemoryUtilizationAttr: {
+ Unit: MemoryUtilizationUnit,
+ Desc: MemoryUtilizationDesc,
+ IntNumeratorVal: 2,
+ },
+ EncoderUtilizationAttr: {
+ Unit: EncoderUtilizationUnit,
+ Desc: EncoderUtilizationDesc,
+ IntNumeratorVal: 2,
+ },
+ DecoderUtilizationAttr: {
+ Unit: DecoderUtilizationUnit,
+ Desc: DecoderUtilizationDesc,
+ IntNumeratorVal: 2,
+ },
+ TemperatureAttr: {
+ Unit: TemperatureUnit,
+ Desc: TemperatureDesc,
+ IntNumeratorVal: 2,
+ },
+ MemoryStateAttr: {
+ Unit: MemoryStateUnit,
+ Desc: MemoryStateDesc,
+ IntNumeratorVal: 2,
+ IntDenominatorVal: 2,
+ },
+ BAR1StateAttr: {
+ Unit: BAR1StateUnit,
+ Desc: BAR1StateDesc,
+ IntNumeratorVal: 2,
+ IntDenominatorVal: 256,
+ },
+ ECCErrorsL1CacheAttr: {
+ Unit: ECCErrorsL1CacheUnit,
+ Desc: ECCErrorsL1CacheDesc,
+ IntNumeratorVal: 200,
+ },
+ ECCErrorsL2CacheAttr: {
+ Unit: ECCErrorsL2CacheUnit,
+ Desc: ECCErrorsL2CacheDesc,
+ IntNumeratorVal: 200,
+ },
+ ECCErrorsDeviceAttr: {
+ Unit: ECCErrorsDeviceUnit,
+ Desc: ECCErrorsDeviceDesc,
+ IntNumeratorVal: 200,
+ },
+ },
+ },
+ Timestamp: time.Date(1974, time.May, 19, 1, 2, 3, 4, time.UTC),
+ },
+ "UUID3": {
+ Summary: &device.StatValue{
+ Unit: TemperatureUnit,
+ Desc: TemperatureDesc,
+ IntNumeratorVal: 3,
+ },
+ Stats: &device.StatObject{
+ Attributes: map[string]*device.StatValue{
+ PowerUsageAttr: {
+ Unit: PowerUsageUnit,
+ Desc: PowerUsageDesc,
+ IntNumeratorVal: 3,
+ IntDenominatorVal: 3,
+ },
+ GPUUtilizationAttr: {
+ Unit: GPUUtilizationUnit,
+ Desc: GPUUtilizationDesc,
+ IntNumeratorVal: 3,
+ },
+ MemoryUtilizationAttr: {
+ Unit: MemoryUtilizationUnit,
+ Desc: MemoryUtilizationDesc,
+ IntNumeratorVal: 3,
+ },
+ EncoderUtilizationAttr: {
+ Unit: EncoderUtilizationUnit,
+ Desc: EncoderUtilizationDesc,
+ IntNumeratorVal: 3,
+ },
+ DecoderUtilizationAttr: {
+ Unit: DecoderUtilizationUnit,
+ Desc: DecoderUtilizationDesc,
+ IntNumeratorVal: 3,
+ },
+ TemperatureAttr: {
+ Unit: TemperatureUnit,
+ Desc: TemperatureDesc,
+ IntNumeratorVal: 3,
+ },
+ MemoryStateAttr: {
+ Unit: MemoryStateUnit,
+ Desc: MemoryStateDesc,
+ IntNumeratorVal: 3,
+ IntDenominatorVal: 3,
+ },
+ BAR1StateAttr: {
+ Unit: BAR1StateUnit,
+ Desc: BAR1StateDesc,
+ IntNumeratorVal: 3,
+ IntDenominatorVal: 256,
+ },
+ ECCErrorsL1CacheAttr: {
+ Unit: ECCErrorsL1CacheUnit,
+ Desc: ECCErrorsL1CacheDesc,
+ IntNumeratorVal: 300,
+ },
+ ECCErrorsL2CacheAttr: {
+ Unit: ECCErrorsL2CacheUnit,
+ Desc: ECCErrorsL2CacheDesc,
+ IntNumeratorVal: 300,
+ },
+ ECCErrorsDeviceAttr: {
+ Unit: ECCErrorsDeviceUnit,
+ Desc: ECCErrorsDeviceDesc,
+ IntNumeratorVal: 300,
+ },
+ },
+ },
+ Timestamp: time.Date(1974, time.May, 19, 1, 2, 3, 4, time.UTC),
+ },
+ },
+ },
+ },
+ } {
+ actualResult := statsForGroup(testCase.GroupName, testCase.GroupStats, testCase.Timestamp)
+ require.New(t).Equal(testCase.ExpectedResult, actualResult)
+ }
+}
+
+func TestWriteStatsToChannel(t *testing.T) {
+ for _, testCase := range []struct {
+ Name string
+ ExpectedWriteToChannel *device.StatsResponse
+ Timestamp time.Time
+ Device *NvidiaDevice
+ }{
+ {
+ Name: "NVML wrapper returns error",
+ Timestamp: time.Date(1974, time.May, 19, 1, 2, 3, 4, time.UTC),
+ ExpectedWriteToChannel: &device.StatsResponse{
+ Error: errors.New(""),
+ },
+ Device: &NvidiaDevice{
+ nvmlClient: &MockNvmlClient{
+ StatsError: errors.New(""),
+ },
+ logger: hclog.NewNullLogger(),
+ },
+ },
+ {
+ Name: "Check that stats with multiple DeviceNames are assigned to different groups",
+ Timestamp: time.Date(1974, time.May, 19, 1, 2, 3, 4, time.UTC),
+ Device: &NvidiaDevice{
+ devices: map[string]struct{}{
+ "UUID1": {},
+ "UUID2": {},
+ "UUID3": {},
+ },
+ nvmlClient: &MockNvmlClient{
+ StatsResponseReturned: []*nvml.StatsData{
+ {
+ DeviceData: &nvml.DeviceData{
+ UUID: "UUID1",
+ DeviceName: helper.StringToPtr("DeviceName1"),
+ MemoryMiB: helper.Uint64ToPtr(1),
+ PowerW: helper.UintToPtr(1),
+ BAR1MiB: helper.Uint64ToPtr(256),
+ },
+ PowerUsageW: helper.UintToPtr(1),
+ GPUUtilization: helper.UintToPtr(1),
+ MemoryUtilization: helper.UintToPtr(1),
+ EncoderUtilization: helper.UintToPtr(1),
+ DecoderUtilization: helper.UintToPtr(1),
+ TemperatureC: helper.UintToPtr(1),
+ UsedMemoryMiB: helper.Uint64ToPtr(1),
+ BAR1UsedMiB: helper.Uint64ToPtr(1),
+ ECCErrorsL1Cache: helper.Uint64ToPtr(100),
+ ECCErrorsL2Cache: helper.Uint64ToPtr(100),
+ ECCErrorsDevice: helper.Uint64ToPtr(100),
+ },
+ {
+ DeviceData: &nvml.DeviceData{
+ UUID: "UUID2",
+ DeviceName: helper.StringToPtr("DeviceName2"),
+ MemoryMiB: helper.Uint64ToPtr(2),
+ PowerW: helper.UintToPtr(2),
+ BAR1MiB: helper.Uint64ToPtr(256),
+ },
+ PowerUsageW: helper.UintToPtr(2),
+ GPUUtilization: helper.UintToPtr(2),
+ MemoryUtilization: helper.UintToPtr(2),
+ EncoderUtilization: helper.UintToPtr(2),
+ DecoderUtilization: helper.UintToPtr(2),
+ TemperatureC: helper.UintToPtr(2),
+ UsedMemoryMiB: helper.Uint64ToPtr(2),
+ BAR1UsedMiB: helper.Uint64ToPtr(2),
+ ECCErrorsL1Cache: helper.Uint64ToPtr(200),
+ ECCErrorsL2Cache: helper.Uint64ToPtr(200),
+ ECCErrorsDevice: helper.Uint64ToPtr(200),
+ },
+ {
+ DeviceData: &nvml.DeviceData{
+ UUID: "UUID3",
+ DeviceName: helper.StringToPtr("DeviceName3"),
+ MemoryMiB: helper.Uint64ToPtr(3),
+ PowerW: helper.UintToPtr(3),
+ BAR1MiB: helper.Uint64ToPtr(256),
+ },
+ PowerUsageW: helper.UintToPtr(3),
+ GPUUtilization: helper.UintToPtr(3),
+ MemoryUtilization: helper.UintToPtr(3),
+ EncoderUtilization: helper.UintToPtr(3),
+ DecoderUtilization: helper.UintToPtr(3),
+ TemperatureC: helper.UintToPtr(3),
+ UsedMemoryMiB: helper.Uint64ToPtr(3),
+ BAR1UsedMiB: helper.Uint64ToPtr(3),
+ ECCErrorsL1Cache: helper.Uint64ToPtr(300),
+ ECCErrorsL2Cache: helper.Uint64ToPtr(300),
+ ECCErrorsDevice: helper.Uint64ToPtr(300),
+ },
+ },
+ },
+ logger: hclog.NewNullLogger(),
+ },
+ ExpectedWriteToChannel: &device.StatsResponse{
+ Groups: []*device.DeviceGroupStats{
+ {
+ Vendor: vendor,
+ Type: deviceType,
+ Name: "DeviceName1",
+ InstanceStats: map[string]*device.DeviceStats{
+ "UUID1": {
+ Summary: &device.StatValue{
+ Unit: TemperatureUnit,
+ Desc: TemperatureDesc,
+ IntNumeratorVal: 1,
+ },
+ Stats: &device.StatObject{
+ Attributes: map[string]*device.StatValue{
+ PowerUsageAttr: {
+ Unit: PowerUsageUnit,
+ Desc: PowerUsageDesc,
+ IntNumeratorVal: 1,
+ IntDenominatorVal: 1,
+ },
+ GPUUtilizationAttr: {
+ Unit: GPUUtilizationUnit,
+ Desc: GPUUtilizationDesc,
+ IntNumeratorVal: 1,
+ },
+ MemoryUtilizationAttr: {
+ Unit: MemoryUtilizationUnit,
+ Desc: MemoryUtilizationDesc,
+ IntNumeratorVal: 1,
+ },
+ EncoderUtilizationAttr: {
+ Unit: EncoderUtilizationUnit,
+ Desc: EncoderUtilizationDesc,
+ IntNumeratorVal: 1,
+ },
+ DecoderUtilizationAttr: {
+ Unit: DecoderUtilizationUnit,
+ Desc: DecoderUtilizationDesc,
+ IntNumeratorVal: 1,
+ },
+ TemperatureAttr: {
+ Unit: TemperatureUnit,
+ Desc: TemperatureDesc,
+ IntNumeratorVal: 1,
+ },
+ MemoryStateAttr: {
+ Unit: MemoryStateUnit,
+ Desc: MemoryStateDesc,
+ IntNumeratorVal: 1,
+ IntDenominatorVal: 1,
+ },
+ BAR1StateAttr: {
+ Unit: BAR1StateUnit,
+ Desc: BAR1StateDesc,
+ IntNumeratorVal: 1,
+ IntDenominatorVal: 256,
+ },
+ ECCErrorsL1CacheAttr: {
+ Unit: ECCErrorsL1CacheUnit,
+ Desc: ECCErrorsL1CacheDesc,
+ IntNumeratorVal: 100,
+ },
+ ECCErrorsL2CacheAttr: {
+ Unit: ECCErrorsL2CacheUnit,
+ Desc: ECCErrorsL2CacheDesc,
+ IntNumeratorVal: 100,
+ },
+ ECCErrorsDeviceAttr: {
+ Unit: ECCErrorsDeviceUnit,
+ Desc: ECCErrorsDeviceDesc,
+ IntNumeratorVal: 100,
+ },
+ },
+ },
+ Timestamp: time.Date(1974, time.May, 19, 1, 2, 3, 4, time.UTC),
+ },
+ },
+ },
+ {
+ Vendor: vendor,
+ Type: deviceType,
+ Name: "DeviceName2",
+ InstanceStats: map[string]*device.DeviceStats{
+ "UUID2": {
+ Summary: &device.StatValue{
+ Unit: TemperatureUnit,
+ Desc: TemperatureDesc,
+ IntNumeratorVal: 2,
+ },
+ Stats: &device.StatObject{
+ Attributes: map[string]*device.StatValue{
+ PowerUsageAttr: {
+ Unit: PowerUsageUnit,
+ Desc: PowerUsageDesc,
+ IntNumeratorVal: 2,
+ IntDenominatorVal: 2,
+ },
+ GPUUtilizationAttr: {
+ Unit: GPUUtilizationUnit,
+ Desc: GPUUtilizationDesc,
+ IntNumeratorVal: 2,
+ },
+ MemoryUtilizationAttr: {
+ Unit: MemoryUtilizationUnit,
+ Desc: MemoryUtilizationDesc,
+ IntNumeratorVal: 2,
+ },
+ EncoderUtilizationAttr: {
+ Unit: EncoderUtilizationUnit,
+ Desc: EncoderUtilizationDesc,
+ IntNumeratorVal: 2,
+ },
+ DecoderUtilizationAttr: {
+ Unit: DecoderUtilizationUnit,
+ Desc: DecoderUtilizationDesc,
+ IntNumeratorVal: 2,
+ },
+ TemperatureAttr: {
+ Unit: TemperatureUnit,
+ Desc: TemperatureDesc,
+ IntNumeratorVal: 2,
+ },
+ MemoryStateAttr: {
+ Unit: MemoryStateUnit,
+ Desc: MemoryStateDesc,
+ IntNumeratorVal: 2,
+ IntDenominatorVal: 2,
+ },
+ BAR1StateAttr: {
+ Unit: BAR1StateUnit,
+ Desc: BAR1StateDesc,
+ IntNumeratorVal: 2,
+ IntDenominatorVal: 256,
+ },
+ ECCErrorsL1CacheAttr: {
+ Unit: ECCErrorsL1CacheUnit,
+ Desc: ECCErrorsL1CacheDesc,
+ IntNumeratorVal: 200,
+ },
+ ECCErrorsL2CacheAttr: {
+ Unit: ECCErrorsL2CacheUnit,
+ Desc: ECCErrorsL2CacheDesc,
+ IntNumeratorVal: 200,
+ },
+ ECCErrorsDeviceAttr: {
+ Unit: ECCErrorsDeviceUnit,
+ Desc: ECCErrorsDeviceDesc,
+ IntNumeratorVal: 200,
+ },
+ },
+ },
+ Timestamp: time.Date(1974, time.May, 19, 1, 2, 3, 4, time.UTC),
+ },
+ },
+ },
+ {
+ Vendor: vendor,
+ Type: deviceType,
+ Name: "DeviceName3",
+ InstanceStats: map[string]*device.DeviceStats{
+ "UUID3": {
+ Summary: &device.StatValue{
+ Unit: TemperatureUnit,
+ Desc: TemperatureDesc,
+ IntNumeratorVal: 3,
+ },
+ Stats: &device.StatObject{
+ Attributes: map[string]*device.StatValue{
+ PowerUsageAttr: {
+ Unit: PowerUsageUnit,
+ Desc: PowerUsageDesc,
+ IntNumeratorVal: 3,
+ IntDenominatorVal: 3,
+ },
+ GPUUtilizationAttr: {
+ Unit: GPUUtilizationUnit,
+ Desc: GPUUtilizationDesc,
+ IntNumeratorVal: 3,
+ },
+ MemoryUtilizationAttr: {
+ Unit: MemoryUtilizationUnit,
+ Desc: MemoryUtilizationDesc,
+ IntNumeratorVal: 3,
+ },
+ EncoderUtilizationAttr: {
+ Unit: EncoderUtilizationUnit,
+ Desc: EncoderUtilizationDesc,
+ IntNumeratorVal: 3,
+ },
+ DecoderUtilizationAttr: {
+ Unit: DecoderUtilizationUnit,
+ Desc: DecoderUtilizationDesc,
+ IntNumeratorVal: 3,
+ },
+ TemperatureAttr: {
+ Unit: TemperatureUnit,
+ Desc: TemperatureDesc,
+ IntNumeratorVal: 3,
+ },
+ MemoryStateAttr: {
+ Unit: MemoryStateUnit,
+ Desc: MemoryStateDesc,
+ IntNumeratorVal: 3,
+ IntDenominatorVal: 3,
+ },
+ BAR1StateAttr: {
+ Unit: BAR1StateUnit,
+ Desc: BAR1StateDesc,
+ IntNumeratorVal: 3,
+ IntDenominatorVal: 256,
+ },
+ ECCErrorsL1CacheAttr: {
+ Unit: ECCErrorsL1CacheUnit,
+ Desc: ECCErrorsL1CacheDesc,
+ IntNumeratorVal: 300,
+ },
+ ECCErrorsL2CacheAttr: {
+ Unit: ECCErrorsL2CacheUnit,
+ Desc: ECCErrorsL2CacheDesc,
+ IntNumeratorVal: 300,
+ },
+ ECCErrorsDeviceAttr: {
+ Unit: ECCErrorsDeviceUnit,
+ Desc: ECCErrorsDeviceDesc,
+ IntNumeratorVal: 300,
+ },
+ },
+ },
+ Timestamp: time.Date(1974, time.May, 19, 1, 2, 3, 4, time.UTC),
+ },
+ },
+ },
+ },
+ },
+ },
+ {
+ Name: "Check that stats with multiple DeviceNames are assigned to different groups 2",
+ Timestamp: time.Date(1974, time.May, 19, 1, 2, 3, 4, time.UTC),
+ Device: &NvidiaDevice{
+ devices: map[string]struct{}{
+ "UUID1": {},
+ "UUID2": {},
+ "UUID3": {},
+ },
+ nvmlClient: &MockNvmlClient{
+ StatsResponseReturned: []*nvml.StatsData{
+ {
+ DeviceData: &nvml.DeviceData{
+ UUID: "UUID1",
+ DeviceName: helper.StringToPtr("DeviceName1"),
+ MemoryMiB: helper.Uint64ToPtr(1),
+ PowerW: helper.UintToPtr(1),
+ BAR1MiB: helper.Uint64ToPtr(256),
+ },
+ PowerUsageW: helper.UintToPtr(1),
+ GPUUtilization: helper.UintToPtr(1),
+ MemoryUtilization: helper.UintToPtr(1),
+ EncoderUtilization: helper.UintToPtr(1),
+ DecoderUtilization: helper.UintToPtr(1),
+ TemperatureC: helper.UintToPtr(1),
+ UsedMemoryMiB: helper.Uint64ToPtr(1),
+ BAR1UsedMiB: helper.Uint64ToPtr(1),
+ ECCErrorsL1Cache: helper.Uint64ToPtr(100),
+ ECCErrorsL2Cache: helper.Uint64ToPtr(100),
+ ECCErrorsDevice: helper.Uint64ToPtr(100),
+ },
+ {
+ DeviceData: &nvml.DeviceData{
+ UUID: "UUID2",
+ DeviceName: helper.StringToPtr("DeviceName2"),
+ MemoryMiB: helper.Uint64ToPtr(2),
+ PowerW: helper.UintToPtr(2),
+ BAR1MiB: helper.Uint64ToPtr(256),
+ },
+ PowerUsageW: helper.UintToPtr(2),
+ GPUUtilization: helper.UintToPtr(2),
+ MemoryUtilization: helper.UintToPtr(2),
+ EncoderUtilization: helper.UintToPtr(2),
+ DecoderUtilization: helper.UintToPtr(2),
+ TemperatureC: helper.UintToPtr(2),
+ UsedMemoryMiB: helper.Uint64ToPtr(2),
+ BAR1UsedMiB: helper.Uint64ToPtr(2),
+ ECCErrorsL1Cache: helper.Uint64ToPtr(200),
+ ECCErrorsL2Cache: helper.Uint64ToPtr(200),
+ ECCErrorsDevice: helper.Uint64ToPtr(200),
+ },
+ {
+ DeviceData: &nvml.DeviceData{
+ UUID: "UUID3",
+ DeviceName: helper.StringToPtr("DeviceName2"),
+ MemoryMiB: helper.Uint64ToPtr(3),
+ PowerW: helper.UintToPtr(3),
+ BAR1MiB: helper.Uint64ToPtr(256),
+ },
+ PowerUsageW: helper.UintToPtr(3),
+ GPUUtilization: helper.UintToPtr(3),
+ MemoryUtilization: helper.UintToPtr(3),
+ EncoderUtilization: helper.UintToPtr(3),
+ DecoderUtilization: helper.UintToPtr(3),
+ TemperatureC: helper.UintToPtr(3),
+ UsedMemoryMiB: helper.Uint64ToPtr(3),
+ BAR1UsedMiB: helper.Uint64ToPtr(3),
+ ECCErrorsL1Cache: helper.Uint64ToPtr(300),
+ ECCErrorsL2Cache: helper.Uint64ToPtr(300),
+ ECCErrorsDevice: helper.Uint64ToPtr(300),
+ },
+ },
+ },
+ logger: hclog.NewNullLogger(),
+ },
+ ExpectedWriteToChannel: &device.StatsResponse{
+ Groups: []*device.DeviceGroupStats{
+ {
+ Vendor: vendor,
+ Type: deviceType,
+ Name: "DeviceName1",
+ InstanceStats: map[string]*device.DeviceStats{
+ "UUID1": {
+ Summary: &device.StatValue{
+ Unit: TemperatureUnit,
+ Desc: TemperatureDesc,
+ IntNumeratorVal: 1,
+ },
+ Stats: &device.StatObject{
+ Attributes: map[string]*device.StatValue{
+ PowerUsageAttr: {
+ Unit: PowerUsageUnit,
+ Desc: PowerUsageDesc,
+ IntNumeratorVal: 1,
+ IntDenominatorVal: 1,
+ },
+ GPUUtilizationAttr: {
+ Unit: GPUUtilizationUnit,
+ Desc: GPUUtilizationDesc,
+ IntNumeratorVal: 1,
+ },
+ MemoryUtilizationAttr: {
+ Unit: MemoryUtilizationUnit,
+ Desc: MemoryUtilizationDesc,
+ IntNumeratorVal: 1,
+ },
+ EncoderUtilizationAttr: {
+ Unit: EncoderUtilizationUnit,
+ Desc: EncoderUtilizationDesc,
+ IntNumeratorVal: 1,
+ },
+ DecoderUtilizationAttr: {
+ Unit: DecoderUtilizationUnit,
+ Desc: DecoderUtilizationDesc,
+ IntNumeratorVal: 1,
+ },
+ TemperatureAttr: {
+ Unit: TemperatureUnit,
+ Desc: TemperatureDesc,
+ IntNumeratorVal: 1,
+ },
+ MemoryStateAttr: {
+ Unit: MemoryStateUnit,
+ Desc: MemoryStateDesc,
+ IntNumeratorVal: 1,
+ IntDenominatorVal: 1,
+ },
+ BAR1StateAttr: {
+ Unit: BAR1StateUnit,
+ Desc: BAR1StateDesc,
+ IntNumeratorVal: 1,
+ IntDenominatorVal: 256,
+ },
+ ECCErrorsL1CacheAttr: {
+ Unit: ECCErrorsL1CacheUnit,
+ Desc: ECCErrorsL1CacheDesc,
+ IntNumeratorVal: 100,
+ },
+ ECCErrorsL2CacheAttr: {
+ Unit: ECCErrorsL2CacheUnit,
+ Desc: ECCErrorsL2CacheDesc,
+ IntNumeratorVal: 100,
+ },
+ ECCErrorsDeviceAttr: {
+ Unit: ECCErrorsDeviceUnit,
+ Desc: ECCErrorsDeviceDesc,
+ IntNumeratorVal: 100,
+ },
+ },
+ },
+ Timestamp: time.Date(1974, time.May, 19, 1, 2, 3, 4, time.UTC),
+ },
+ },
+ },
+ {
+ Vendor: vendor,
+ Type: deviceType,
+ Name: "DeviceName2",
+ InstanceStats: map[string]*device.DeviceStats{
+ "UUID3": {
+ Summary: &device.StatValue{
+ Unit: TemperatureUnit,
+ Desc: TemperatureDesc,
+ IntNumeratorVal: 3,
+ },
+ Stats: &device.StatObject{
+ Attributes: map[string]*device.StatValue{
+ PowerUsageAttr: {
+ Unit: PowerUsageUnit,
+ Desc: PowerUsageDesc,
+ IntNumeratorVal: 3,
+ IntDenominatorVal: 3,
+ },
+ GPUUtilizationAttr: {
+ Unit: GPUUtilizationUnit,
+ Desc: GPUUtilizationDesc,
+ IntNumeratorVal: 3,
+ },
+ MemoryUtilizationAttr: {
+ Unit: MemoryUtilizationUnit,
+ Desc: MemoryUtilizationDesc,
+ IntNumeratorVal: 3,
+ },
+ EncoderUtilizationAttr: {
+ Unit: EncoderUtilizationUnit,
+ Desc: EncoderUtilizationDesc,
+ IntNumeratorVal: 3,
+ },
+ DecoderUtilizationAttr: {
+ Unit: DecoderUtilizationUnit,
+ Desc: DecoderUtilizationDesc,
+ IntNumeratorVal: 3,
+ },
+ TemperatureAttr: {
+ Unit: TemperatureUnit,
+ Desc: TemperatureDesc,
+ IntNumeratorVal: 3,
+ },
+ MemoryStateAttr: {
+ Unit: MemoryStateUnit,
+ Desc: MemoryStateDesc,
+ IntNumeratorVal: 3,
+ IntDenominatorVal: 3,
+ },
+ BAR1StateAttr: {
+ Unit: BAR1StateUnit,
+ Desc: BAR1StateDesc,
+ IntNumeratorVal: 3,
+ IntDenominatorVal: 256,
+ },
+ ECCErrorsL1CacheAttr: {
+ Unit: ECCErrorsL1CacheUnit,
+ Desc: ECCErrorsL1CacheDesc,
+ IntNumeratorVal: 300,
+ },
+ ECCErrorsL2CacheAttr: {
+ Unit: ECCErrorsL2CacheUnit,
+ Desc: ECCErrorsL2CacheDesc,
+ IntNumeratorVal: 300,
+ },
+ ECCErrorsDeviceAttr: {
+ Unit: ECCErrorsDeviceUnit,
+ Desc: ECCErrorsDeviceDesc,
+ IntNumeratorVal: 300,
+ },
+ },
+ },
+ Timestamp: time.Date(1974, time.May, 19, 1, 2, 3, 4, time.UTC),
+ },
+ "UUID2": {
+ Summary: &device.StatValue{
+ Unit: TemperatureUnit,
+ Desc: TemperatureDesc,
+ IntNumeratorVal: 2,
+ },
+ Stats: &device.StatObject{
+ Attributes: map[string]*device.StatValue{
+ PowerUsageAttr: {
+ Unit: PowerUsageUnit,
+ Desc: PowerUsageDesc,
+ IntNumeratorVal: 2,
+ IntDenominatorVal: 2,
+ },
+ GPUUtilizationAttr: {
+ Unit: GPUUtilizationUnit,
+ Desc: GPUUtilizationDesc,
+ IntNumeratorVal: 2,
+ },
+ MemoryUtilizationAttr: {
+ Unit: MemoryUtilizationUnit,
+ Desc: MemoryUtilizationDesc,
+ IntNumeratorVal: 2,
+ },
+ EncoderUtilizationAttr: {
+ Unit: EncoderUtilizationUnit,
+ Desc: EncoderUtilizationDesc,
+ IntNumeratorVal: 2,
+ },
+ DecoderUtilizationAttr: {
+ Unit: DecoderUtilizationUnit,
+ Desc: DecoderUtilizationDesc,
+ IntNumeratorVal: 2,
+ },
+ TemperatureAttr: {
+ Unit: TemperatureUnit,
+ Desc: TemperatureDesc,
+ IntNumeratorVal: 2,
+ },
+ MemoryStateAttr: {
+ Unit: MemoryStateUnit,
+ Desc: MemoryStateDesc,
+ IntNumeratorVal: 2,
+ IntDenominatorVal: 2,
+ },
+ BAR1StateAttr: {
+ Unit: BAR1StateUnit,
+ Desc: BAR1StateDesc,
+ IntNumeratorVal: 2,
+ IntDenominatorVal: 256,
+ },
+ ECCErrorsL1CacheAttr: {
+ Unit: ECCErrorsL1CacheUnit,
+ Desc: ECCErrorsL1CacheDesc,
+ IntNumeratorVal: 200,
+ },
+ ECCErrorsL2CacheAttr: {
+ Unit: ECCErrorsL2CacheUnit,
+ Desc: ECCErrorsL2CacheDesc,
+ IntNumeratorVal: 200,
+ },
+ ECCErrorsDeviceAttr: {
+ Unit: ECCErrorsDeviceUnit,
+ Desc: ECCErrorsDeviceDesc,
+ IntNumeratorVal: 200,
+ },
+ },
+ },
+ Timestamp: time.Date(1974, time.May, 19, 1, 2, 3, 4, time.UTC),
+ },
+ },
+ },
+ },
+ },
+ },
+ {
+ Name: "Check that only devices from NvidiaDevice.device map stats are reported",
+ Timestamp: time.Date(1974, time.May, 19, 1, 2, 3, 4, time.UTC),
+ Device: &NvidiaDevice{
+ devices: map[string]struct{}{
+ "UUID1": {},
+ "UUID2": {},
+ },
+ nvmlClient: &MockNvmlClient{
+ StatsResponseReturned: []*nvml.StatsData{
+ {
+ DeviceData: &nvml.DeviceData{
+ UUID: "UUID1",
+ DeviceName: helper.StringToPtr("DeviceName1"),
+ MemoryMiB: helper.Uint64ToPtr(1),
+ PowerW: helper.UintToPtr(1),
+ BAR1MiB: helper.Uint64ToPtr(256),
+ },
+ PowerUsageW: helper.UintToPtr(1),
+ GPUUtilization: helper.UintToPtr(1),
+ MemoryUtilization: helper.UintToPtr(1),
+ EncoderUtilization: helper.UintToPtr(1),
+ DecoderUtilization: helper.UintToPtr(1),
+ TemperatureC: helper.UintToPtr(1),
+ UsedMemoryMiB: helper.Uint64ToPtr(1),
+ BAR1UsedMiB: helper.Uint64ToPtr(1),
+ ECCErrorsL1Cache: helper.Uint64ToPtr(100),
+ ECCErrorsL2Cache: helper.Uint64ToPtr(100),
+ ECCErrorsDevice: helper.Uint64ToPtr(100),
+ },
+ {
+ DeviceData: &nvml.DeviceData{
+ UUID: "UUID2",
+ DeviceName: helper.StringToPtr("DeviceName2"),
+ MemoryMiB: helper.Uint64ToPtr(2),
+ PowerW: helper.UintToPtr(2),
+ BAR1MiB: helper.Uint64ToPtr(256),
+ },
+ PowerUsageW: helper.UintToPtr(2),
+ GPUUtilization: helper.UintToPtr(2),
+ MemoryUtilization: helper.UintToPtr(2),
+ EncoderUtilization: helper.UintToPtr(2),
+ DecoderUtilization: helper.UintToPtr(2),
+ TemperatureC: helper.UintToPtr(2),
+ UsedMemoryMiB: helper.Uint64ToPtr(2),
+ BAR1UsedMiB: helper.Uint64ToPtr(2),
+ ECCErrorsL1Cache: helper.Uint64ToPtr(200),
+ ECCErrorsL2Cache: helper.Uint64ToPtr(200),
+ ECCErrorsDevice: helper.Uint64ToPtr(200),
+ },
+ {
+ DeviceData: &nvml.DeviceData{
+ UUID: "UUID3",
+ DeviceName: helper.StringToPtr("DeviceName3"),
+ MemoryMiB: helper.Uint64ToPtr(3),
+ PowerW: helper.UintToPtr(3),
+ BAR1MiB: helper.Uint64ToPtr(256),
+ },
+ PowerUsageW: helper.UintToPtr(3),
+ GPUUtilization: helper.UintToPtr(3),
+ MemoryUtilization: helper.UintToPtr(3),
+ EncoderUtilization: helper.UintToPtr(3),
+ DecoderUtilization: helper.UintToPtr(3),
+ TemperatureC: helper.UintToPtr(3),
+ UsedMemoryMiB: helper.Uint64ToPtr(3),
+ BAR1UsedMiB: helper.Uint64ToPtr(3),
+ ECCErrorsL1Cache: helper.Uint64ToPtr(300),
+ ECCErrorsL2Cache: helper.Uint64ToPtr(300),
+ ECCErrorsDevice: helper.Uint64ToPtr(300),
+ },
+ },
+ },
+ logger: hclog.NewNullLogger(),
+ },
+ ExpectedWriteToChannel: &device.StatsResponse{
+ Groups: []*device.DeviceGroupStats{
+ {
+ Vendor: vendor,
+ Type: deviceType,
+ Name: "DeviceName1",
+ InstanceStats: map[string]*device.DeviceStats{
+ "UUID1": {
+ Summary: &device.StatValue{
+ Unit: TemperatureUnit,
+ Desc: TemperatureDesc,
+ IntNumeratorVal: 1,
+ },
+ Stats: &device.StatObject{
+ Attributes: map[string]*device.StatValue{
+ PowerUsageAttr: {
+ Unit: PowerUsageUnit,
+ Desc: PowerUsageDesc,
+ IntNumeratorVal: 1,
+ IntDenominatorVal: 1,
+ },
+ GPUUtilizationAttr: {
+ Unit: GPUUtilizationUnit,
+ Desc: GPUUtilizationDesc,
+ IntNumeratorVal: 1,
+ },
+ MemoryUtilizationAttr: {
+ Unit: MemoryUtilizationUnit,
+ Desc: MemoryUtilizationDesc,
+ IntNumeratorVal: 1,
+ },
+ EncoderUtilizationAttr: {
+ Unit: EncoderUtilizationUnit,
+ Desc: EncoderUtilizationDesc,
+ IntNumeratorVal: 1,
+ },
+ DecoderUtilizationAttr: {
+ Unit: DecoderUtilizationUnit,
+ Desc: DecoderUtilizationDesc,
+ IntNumeratorVal: 1,
+ },
+ TemperatureAttr: {
+ Unit: TemperatureUnit,
+ Desc: TemperatureDesc,
+ IntNumeratorVal: 1,
+ },
+ MemoryStateAttr: {
+ Unit: MemoryStateUnit,
+ Desc: MemoryStateDesc,
+ IntNumeratorVal: 1,
+ IntDenominatorVal: 1,
+ },
+ BAR1StateAttr: {
+ Unit: BAR1StateUnit,
+ Desc: BAR1StateDesc,
+ IntNumeratorVal: 1,
+ IntDenominatorVal: 256,
+ },
+ ECCErrorsL1CacheAttr: {
+ Unit: ECCErrorsL1CacheUnit,
+ Desc: ECCErrorsL1CacheDesc,
+ IntNumeratorVal: 100,
+ },
+ ECCErrorsL2CacheAttr: {
+ Unit: ECCErrorsL2CacheUnit,
+ Desc: ECCErrorsL2CacheDesc,
+ IntNumeratorVal: 100,
+ },
+ ECCErrorsDeviceAttr: {
+ Unit: ECCErrorsDeviceUnit,
+ Desc: ECCErrorsDeviceDesc,
+ IntNumeratorVal: 100,
+ },
+ },
+ },
+ Timestamp: time.Date(1974, time.May, 19, 1, 2, 3, 4, time.UTC),
+ },
+ },
+ },
+ {
+ Vendor: vendor,
+ Type: deviceType,
+ Name: "DeviceName2",
+ InstanceStats: map[string]*device.DeviceStats{
+ "UUID2": {
+ Summary: &device.StatValue{
+ Unit: TemperatureUnit,
+ Desc: TemperatureDesc,
+ IntNumeratorVal: 2,
+ },
+ Stats: &device.StatObject{
+ Attributes: map[string]*device.StatValue{
+ PowerUsageAttr: {
+ Unit: PowerUsageUnit,
+ Desc: PowerUsageDesc,
+ IntNumeratorVal: 2,
+ IntDenominatorVal: 2,
+ },
+ GPUUtilizationAttr: {
+ Unit: GPUUtilizationUnit,
+ Desc: GPUUtilizationDesc,
+ IntNumeratorVal: 2,
+ },
+ MemoryUtilizationAttr: {
+ Unit: MemoryUtilizationUnit,
+ Desc: MemoryUtilizationDesc,
+ IntNumeratorVal: 2,
+ },
+ EncoderUtilizationAttr: {
+ Unit: EncoderUtilizationUnit,
+ Desc: EncoderUtilizationDesc,
+ IntNumeratorVal: 2,
+ },
+ DecoderUtilizationAttr: {
+ Unit: DecoderUtilizationUnit,
+ Desc: DecoderUtilizationDesc,
+ IntNumeratorVal: 2,
+ },
+ TemperatureAttr: {
+ Unit: TemperatureUnit,
+ Desc: TemperatureDesc,
+ IntNumeratorVal: 2,
+ },
+ MemoryStateAttr: {
+ Unit: MemoryStateUnit,
+ Desc: MemoryStateDesc,
+ IntNumeratorVal: 2,
+ IntDenominatorVal: 2,
+ },
+ BAR1StateAttr: {
+ Unit: BAR1StateUnit,
+ Desc: BAR1StateDesc,
+ IntNumeratorVal: 2,
+ IntDenominatorVal: 256,
+ },
+ ECCErrorsL1CacheAttr: {
+ Unit: ECCErrorsL1CacheUnit,
+ Desc: ECCErrorsL1CacheDesc,
+ IntNumeratorVal: 200,
+ },
+ ECCErrorsL2CacheAttr: {
+ Unit: ECCErrorsL2CacheUnit,
+ Desc: ECCErrorsL2CacheDesc,
+ IntNumeratorVal: 200,
+ },
+ ECCErrorsDeviceAttr: {
+ Unit: ECCErrorsDeviceUnit,
+ Desc: ECCErrorsDeviceDesc,
+ IntNumeratorVal: 200,
+ },
+ },
+ },
+ Timestamp: time.Date(1974, time.May, 19, 1, 2, 3, 4, time.UTC),
+ },
+ },
+ },
+ },
+ },
+ },
+ } {
+ channel := make(chan *device.StatsResponse, 1)
+ testCase.Device.writeStatsToChannel(channel, testCase.Timestamp)
+ actualResult := <-channel
+ // writeStatsToChannel iterates over map keys
+ // and inserts results into an array, so the order of elements in the output array
+ // may differ between runs;
+ // the Groups arrays of actualResult and ExpectedWriteToChannel have to be sorted first
+ sort.Slice(actualResult.Groups, func(i, j int) bool {
+ return actualResult.Groups[i].Name < actualResult.Groups[j].Name
+ })
+ sort.Slice(testCase.ExpectedWriteToChannel.Groups, func(i, j int) bool {
+ return testCase.ExpectedWriteToChannel.Groups[i].Name < testCase.ExpectedWriteToChannel.Groups[j].Name
+ })
+ require.New(t).Equal(testCase.ExpectedWriteToChannel, actualResult)
+ }
+}
diff --git a/vendor/github.com/NVIDIA/gpu-monitoring-tools/CLA b/vendor/github.com/NVIDIA/gpu-monitoring-tools/CLA
new file mode 100644
index 000000000..1001ecb5f
--- /dev/null
+++ b/vendor/github.com/NVIDIA/gpu-monitoring-tools/CLA
@@ -0,0 +1,160 @@
+ GPU Monitoring Tools
+ Software Grant and Corporate Contributor License Agreement ("Agreement")
+
+ Thank you for your interest in the gpu-monitoring-tools Project (the
+ "Project"). In order to clarify the intellectual property license
+ granted with Contributions from any person or entity, NVIDIA
+ Corporation (the “Copyright Holders") must have a Contributor License
+ Agreement (CLA) on file that has been signed by each Contributor,
+ indicating agreement to the license terms below. This license is
+ for your protection as a Contributor as well as the protection of the
+ Project and its users; it does not change your rights to use your own
+ Contributions for any other purpose.
+
+ This version of the Agreement allows an entity (the "Corporation") to
+ submit Contributions to the Project, to authorize Contributions
+ submitted by its designated employees to the Project, and to grant
+ copyright and patent licenses thereto to the Copyright Holders.
+
+ If you have not already done so, please complete and sign, then scan and
+ email a pdf file of this Agreement to digits@nvidia.com.
+ Please read this document carefully before signing and keep a copy for
+ your records.
+
+ Corporation name: ________________________________________________
+
+ Corporation address: ________________________________________________
+
+ ________________________________________________
+
+ ________________________________________________
+
+ Point of Contact: ________________________________________________
+
+ E-Mail: ________________________________________________
+
+ Telephone: _____________________ Fax: _____________________
+
+
+ You accept and agree to the following terms and conditions for Your
+ present and future Contributions submitted to the Project. In
+ return, the Copyright Holders shall not use Your Contributions in a way
+ that is contrary to the public benefit or inconsistent with its nonprofit
+ status and bylaws in effect at the time of the Contribution. Except
+ for the license granted herein to the Copyright Holders and recipients of
+ software distributed by the Copyright Holders, You reserve all right, title,
+ and interest in and to Your Contributions.
+
+ 1. Definitions.
+
+ "You" (or "Your") shall mean the copyright owner or legal entity
+ authorized by the copyright owner that is making this Agreement
+ with the Copyright Holders. For legal entities, the entity making a
+ Contribution and all other entities that control, are controlled by,
+ or are under common control with that entity are considered to be a
+ single Contributor. For the purposes of this definition, "control"
+ means (i) the power, direct or indirect, to cause the direction or
+ management of such entity, whether by contract or otherwise, or
+ (ii) ownership of fifty percent (50%) or more of the outstanding
+ shares, or (iii) beneficial ownership of such entity.
+
+ "Contribution" shall mean the code, documentation or other original
+ works of authorship expressly identified in Schedule B, as well as
+ any original work of authorship, including
+ any modifications or additions to an existing work, that is intentionally
+ submitted by You to the Copyright Holders for inclusion in, or
+ documentation of, any of the products owned or managed by the
+ Copyright Holders (the "Work"). For the purposes of this definition,
+ "submitted" means any form of electronic, verbal, or written
+ communication sent to the Copyright Holders or its representatives,
+ including but not limited to communication on electronic mailing
+ lists, source code control systems, and issue tracking systems
+ that are managed by, or on behalf of, the Copyright Holders for the
+ purpose of discussing and improving the Work, but excluding
+ communication that is conspicuously marked or otherwise designated
+ in writing by You as "Not a Contribution."
+
+ 2. Grant of Copyright License. Subject to the terms and conditions
+ of this Agreement, You hereby grant to the Copyright Holders and to
+ recipients of software distributed by the Copyright Holders a
+ perpetual, worldwide, non-exclusive, no-charge, royalty-free,
+ irrevocable copyright license to reproduce, prepare derivative works
+ of, publicly display, publicly perform, sublicense, and distribute
+ Your Contributions and such derivative works.
+
+ 3. Grant of Patent License. Subject to the terms and conditions of
+ this Agreement, You hereby grant to the Copyright Holders and to
+ recipients of software distributed by the Copyright Holders
+ a perpetual, worldwide, non-exclusive, no-charge, royalty-free,
+ irrevocable (except as stated in this section) patent license
+ to make, have made, use, offer to sell, sell, import, and otherwise
+ transfer the Work, where such license applies only to those
+ patent claims licensable by You that are necessarily infringed
+ by Your Contribution(s) alone or by combination of Your Contribution(s)
+ with the Work to which such Contribution(s) were submitted.
+ If any entity institutes patent litigation against You or any
+ other entity (including a cross-claim or counterclaim in a lawsuit)
+ alleging that your Contribution, or the Work to which you have
+ contributed, constitutes direct or contributory patent infringement,
+ then any patent licenses granted to that entity under this Agreement
+ for that Contribution or Work shall terminate as of the date such
+ litigation is filed.
+
+ 4. You represent that You are legally entitled to grant the above
+ license. You represent further that each employee of the
+ Corporation designated on Schedule A below (or in a subsequent
+ written modification to that Schedule) is authorized to submit
+ Contributions on behalf of the Corporation.
+
+ 5. You represent that each of Your Contributions is Your original
+ creation (see section 7 for submissions on behalf of others).
+
+ 6. You are not expected to provide support for Your Contributions,
+ except to the extent You desire to provide support. You may provide
+ support for free, for a fee, or not at all. Unless required by
+ applicable law or agreed to in writing, You provide Your
+ Contributions on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS
+ OF ANY KIND, either express or implied, including, without
+ limitation, any warranties or conditions of TITLE, NON-INFRINGEMENT,
+ MERCHANTABILITY, or FITNESS FOR A PARTICULAR PURPOSE.
+
+ 7. Should You wish to submit work that is not Your original creation,
+ You may submit it to the Copyright Holders separately from any
+ Contribution, identifying the complete details of its source and
+ of any license or other restriction (including, but not limited
+ to, related patents, trademarks, and license agreements) of which
+ you are personally aware, and conspicuously marking the work as
+ "Submitted on behalf of a third-party: [named here]".
+
+ 8. It is your responsibility to notify the Copyright Holders when any change
+ is required to the list of designated employees authorized to submit
+ Contributions on behalf of the Corporation, or to the Corporation's
+ Point of Contact with the Copyright Holders.
+
+
+
+ Please sign: __________________________________ Date: _______________
+
+ Title: __________________________________
+
+ Corporation: __________________________________
+
+
+
+
+Schedule A
+
+ [Initial list of designated employees. NB: authorization is not
+ tied to particular Contributions.]
+
+
+
+
+
+
+Schedule B
+
+ [Identification of optional concurrent software grant. Would be
+ left blank or omitted if there is no concurrent software grant.]
+
+
diff --git a/vendor/github.com/NVIDIA/gpu-monitoring-tools/LICENSE b/vendor/github.com/NVIDIA/gpu-monitoring-tools/LICENSE
new file mode 100644
index 000000000..2a718d63d
--- /dev/null
+++ b/vendor/github.com/NVIDIA/gpu-monitoring-tools/LICENSE
@@ -0,0 +1,29 @@
+BSD 3-Clause License
+
+Copyright (c) 2018, NVIDIA Corporation
+All rights reserved.
+
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are met:
+
+* Redistributions of source code must retain the above copyright notice, this
+ list of conditions and the following disclaimer.
+
+* Redistributions in binary form must reproduce the above copyright notice,
+ this list of conditions and the following disclaimer in the documentation
+ and/or other materials provided with the distribution.
+
+* Neither the name of the copyright holder nor the names of its
+ contributors may be used to endorse or promote products derived from
+ this software without specific prior written permission.
+
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
diff --git a/vendor/github.com/NVIDIA/gpu-monitoring-tools/README.md b/vendor/github.com/NVIDIA/gpu-monitoring-tools/README.md
new file mode 100644
index 000000000..58d90402e
--- /dev/null
+++ b/vendor/github.com/NVIDIA/gpu-monitoring-tools/README.md
@@ -0,0 +1,34 @@
+# NVIDIA GPU Monitoring Tools
+
+## NVML Go Bindings
+
+[NVIDIA Management Library (NVML)](https://developer.nvidia.com/nvidia-management-library-nvml) is a C-based API for monitoring and managing NVIDIA GPU devices.
+NVML go bindings are taken from [nvidia-docker 1.0](https://github.com/NVIDIA/nvidia-docker/tree/1.0) with some improvements and additions. NVML headers are also added to the package to make it easy to use and build.
+
+### NVML Samples
+Three [samples](https://github.com/NVIDIA/gpu-monitoring-tools/blob/master/bindings/go/samples/nvml/README.md) are included to demonstrate how to use the NVML API.
+
+
+## DCGM Go Bindings
+
+[NVIDIA Data Center GPU Manager (DCGM)](https://developer.nvidia.com/data-center-gpu-manager-dcgm) is a set of tools for managing and monitoring NVIDIA GPUs in cluster environments. It's a low overhead tool suite that performs a variety of functions on each host system including active health monitoring, diagnostics, system validation, policies, power and clock management, group configuration and accounting.
+
+DCGM go bindings make administering and monitoring containerized GPU applications easy.
+
+### DCGM Samples
+
+DCGM can be run in different modes, seven [samples](https://github.com/NVIDIA/gpu-monitoring-tools/blob/master/bindings/go/samples/dcgm/README.md) and a [REST API](https://github.com/NVIDIA/gpu-monitoring-tools/blob/master/bindings/go/samples/dcgm/restApi/README.md) are included for showing how to use the DCGM API and run it in different modes.
+
+
+## DCGM exporter
+
+GPU metrics exporter for [Prometheus](https://prometheus.io/) leveraging [NVIDIA Data Center GPU Manager (DCGM)](https://developer.nvidia.com/data-center-gpu-manager-dcgm) is a simple shell script that starts nv-hostengine, reads GPU metrics every 1 second and converts it to a standard Prometheus format.
+
+Find the installation and run instructions [here](https://github.com/NVIDIA/gpu-monitoring-tools/blob/master/exporters/prometheus-dcgm/README.md).
+
+## Issues and Contributing
+
+A signed copy of the [Contributor License Agreement](https://github.com/NVIDIA/gpu-monitoring-tools/blob/master/CLA) needs to be provided to digits@nvidia.com before any change can be accepted.
+
+* Please let us know by [filing a new issue](https://github.com/NVIDIA/gpu-monitoring-tools/issues/new)
+* You can contribute by opening a [pull request](https://help.github.com/articles/using-pull-requests/)
diff --git a/vendor/github.com/NVIDIA/gpu-monitoring-tools/bindings/go/nvml/bindings.go b/vendor/github.com/NVIDIA/gpu-monitoring-tools/bindings/go/nvml/bindings.go
new file mode 100644
index 000000000..4bba89834
--- /dev/null
+++ b/vendor/github.com/NVIDIA/gpu-monitoring-tools/bindings/go/nvml/bindings.go
@@ -0,0 +1,634 @@
+// Copyright (c) 2015-2018, NVIDIA CORPORATION. All rights reserved.
+
+package nvml
+
+// #cgo LDFLAGS: -ldl -Wl,--unresolved-symbols=ignore-in-object-files
+// #include "nvml_dl.h"
+import "C"
+
+import (
+ "errors"
+ "fmt"
+ "io/ioutil"
+ "os"
+ "sort"
+ "strconv"
+ "strings"
+)
+
+// Buffer sizes used when calling into NVML, plus a re-exported event type.
+const (
+ szDriver = C.NVML_SYSTEM_DRIVER_VERSION_BUFFER_SIZE // length of the driver version buffer
+ szName = C.NVML_DEVICE_NAME_BUFFER_SIZE // length of the device name buffer
+ szUUID = C.NVML_DEVICE_UUID_BUFFER_SIZE // length of the device UUID buffer
+ szProcs = 32 // capacity of the arrays handed to the running-process queries
+ szProcName = 64 // length of the process name buffer
+
+ // XidCriticalError re-exports NVML's nvmlEventTypeXidCriticalError.
+ XidCriticalError = C.nvmlEventTypeXidCriticalError
+)
+
+// handle wraps the NVML device handle that the device query methods use.
+type handle struct{ dev C.nvmlDevice_t }
+
+// EventSet wraps an NVML event set handle.
+type EventSet struct{ set C.nvmlEventSet_t }
+
+// Event is the Go form of the event data produced by WaitForEvent.
+type Event struct {
+ UUID *string // UUID looked up for the device attached to the event
+ Etype uint64 // copied from the NVML event data's eventType
+ Edata uint64 // copied from the NVML event data's eventData
+}
+
+// uintPtr converts a C unsigned int to a Go uint and returns a pointer to a
+// fresh copy of it.
+func uintPtr(c C.uint) *uint {
+	v := new(uint)
+	*v = uint(c)
+	return v
+}
+
+// uint64Ptr converts a C unsigned long long to a Go uint64 and returns a
+// pointer to a fresh copy of it.
+func uint64Ptr(c C.ulonglong) *uint64 {
+	v := new(uint64)
+	*v = uint64(c)
+	return v
+}
+
+// stringPtr copies a C string into a Go string and returns a pointer to it.
+func stringPtr(c *C.char) *string {
+	v := new(string)
+	*v = C.GoString(c)
+	return v
+}
+
+// errorString translates an NVML return code into a Go error. It returns nil
+// for NVML_SUCCESS and otherwise an error carrying the text produced by
+// nvmlErrorString, prefixed with "nvml: ".
+func errorString(ret C.nvmlReturn_t) error {
+	if ret != C.NVML_SUCCESS {
+		msg := C.GoString(C.nvmlErrorString(ret))
+		return fmt.Errorf("nvml: %v", msg)
+	}
+	return nil
+}
+
+// init_ calls nvmlInit_dl. A missing library is reported with a dedicated
+// message; every other result goes through errorString.
+func init_() error {
+	ret := C.nvmlInit_dl()
+	if ret != C.NVML_ERROR_LIBRARY_NOT_FOUND {
+		return errorString(ret)
+	}
+	return errors.New("could not load NVML library")
+}
+
+// NewEventSet asks NVML to create an event set and returns it wrapped in an
+// EventSet. The return code of nvmlEventSetCreate is not inspected, so a
+// failed creation is not reported to the caller.
+func NewEventSet() EventSet {
+	var raw C.nvmlEventSet_t
+	C.nvmlEventSetCreate(&raw)
+
+	return EventSet{set: raw}
+}
+
+// RegisterEvent registers the given event type on every device NVML
+// enumerates, adding each registration to es. It stops at, and returns, the
+// first error encountered.
+func RegisterEvent(es EventSet, event int) error {
+	count, err := deviceGetCount()
+	if err != nil {
+		return err
+	}
+
+	for idx := uint(0); idx < count; idx++ {
+		dev, err := deviceGetHandleByIndex(idx)
+		if err != nil {
+			return err
+		}
+
+		if ret := C.nvmlDeviceRegisterEvents(dev.dev, C.ulonglong(event), es.set); ret != C.NVML_SUCCESS {
+			return errorString(ret)
+		}
+	}
+
+	return nil
+}
+
+// RegisterEventForDevice registers the given event type on the device whose
+// UUID equals uuid, adding the registration to es.
+//
+// Devices are enumerated in index order and the first match is used. It
+// returns the first enumeration or lookup error, the registration error if
+// NVML rejects the request, or "nvml: device not found" when no device
+// matches.
+func RegisterEventForDevice(es EventSet, event int, uuid string) error {
+	n, err := deviceGetCount()
+	if err != nil {
+		return err
+	}
+
+	for i := uint(0); i < n; i++ {
+		h, err := deviceGetHandleByIndex(i)
+		if err != nil {
+			return err
+		}
+
+		duuid, err := h.deviceGetUUID()
+		if err != nil {
+			return err
+		}
+
+		// deviceGetUUID returns (nil, nil) when NVML reports the query as
+		// unsupported. Such a device cannot match, so skip it instead of
+		// dereferencing a nil pointer.
+		if duuid == nil || *duuid != uuid {
+			continue
+		}
+
+		r := C.nvmlDeviceRegisterEvents(h.dev, C.ulonglong(event), es.set)
+		if r != C.NVML_SUCCESS {
+			return errorString(r)
+		}
+
+		return nil
+	}
+
+	return fmt.Errorf("nvml: device not found")
+}
+
+// DeleteEventSet releases es by calling nvmlEventSetFree. The NVML return
+// code is not inspected.
+func DeleteEventSet(es EventSet) {
+ C.nvmlEventSetFree(es.set)
+}
+
+// WaitForEvent calls nvmlEventSetWait on es with the given timeout and
+// returns the event data it filled in, together with the wait result
+// converted by errorString.
+//
+// The device UUID is looked up regardless of the wait result, and the error
+// from that lookup is discarded.
+func WaitForEvent(es EventSet, timeout uint) (Event, error) {
+	var raw C.nvmlEventData_t
+
+	ret := C.nvmlEventSetWait(es.set, &raw, C.uint(timeout))
+	id, _ := handle{raw.device}.deviceGetUUID()
+
+	ev := Event{
+		UUID:  id,
+		Etype: uint64(raw.eventType),
+		Edata: uint64(raw.eventData),
+	}
+	return ev, errorString(ret)
+}
+
+// shutdown calls nvmlShutdown_dl and converts its return code to an error.
+func shutdown() error {
+ return errorString(C.nvmlShutdown_dl())
+}
+
+// systemGetDriverVersion queries NVML for the driver version string. The
+// buffer contents are returned even when NVML reports an error.
+func systemGetDriverVersion() (string, error) {
+	var buf [szDriver]C.char
+
+	ret := C.nvmlSystemGetDriverVersion(&buf[0], szDriver)
+	version := C.GoString(&buf[0])
+	return version, errorString(ret)
+}
+
+// systemGetProcessName queries NVML for the name of the process with the
+// given pid. The buffer contents are returned even when NVML reports an
+// error.
+func systemGetProcessName(pid uint) (string, error) {
+	var buf [szProcName]C.char
+
+	ret := C.nvmlSystemGetProcessName(C.uint(pid), &buf[0], szProcName)
+	name := C.GoString(&buf[0])
+	return name, errorString(ret)
+}
+
+// deviceGetCount returns the device count reported by nvmlDeviceGetCount.
+func deviceGetCount() (uint, error) {
+	var count C.uint
+
+	ret := C.nvmlDeviceGetCount(&count)
+	total := uint(count)
+	return total, errorString(ret)
+}
+
+// deviceGetHandleByIndex returns the device handle NVML associates with idx.
+func deviceGetHandleByIndex(idx uint) (handle, error) {
+	var h handle
+
+	ret := C.nvmlDeviceGetHandleByIndex(C.uint(idx), &h.dev)
+	return h, errorString(ret)
+}
+
+// deviceGetTopologyCommonAncestor returns the topology level NVML reports
+// as the common ancestor of h1 and h2. It returns (nil, nil) when the
+// function is not found or the query is not supported.
+func deviceGetTopologyCommonAncestor(h1, h2 handle) (*uint, error) {
+	var level C.nvmlGpuTopologyLevel_t
+
+	ret := C.nvmlDeviceGetTopologyCommonAncestor_dl(h1.dev, h2.dev, &level)
+	switch ret {
+	case C.NVML_ERROR_FUNCTION_NOT_FOUND, C.NVML_ERROR_NOT_SUPPORTED:
+		return nil, nil
+	}
+	lvl := uint(C.uint(level))
+	return &lvl, errorString(ret)
+}
+
+// deviceGetName returns the device name, or (nil, nil) when NVML reports
+// the query as not supported.
+func (h handle) deviceGetName() (*string, error) {
+	var buf [szName]C.char
+
+	ret := C.nvmlDeviceGetName(h.dev, &buf[0], szName)
+	if ret == C.NVML_ERROR_NOT_SUPPORTED {
+		return nil, nil
+	}
+	name := C.GoString(&buf[0])
+	return &name, errorString(ret)
+}
+
+// deviceGetUUID returns the device UUID, or (nil, nil) when NVML reports
+// the query as not supported.
+func (h handle) deviceGetUUID() (*string, error) {
+	var buf [szUUID]C.char
+
+	ret := C.nvmlDeviceGetUUID(h.dev, &buf[0], szUUID)
+	if ret == C.NVML_ERROR_NOT_SUPPORTED {
+		return nil, nil
+	}
+	id := C.GoString(&buf[0])
+	return &id, errorString(ret)
+}
+
+// deviceGetPciInfo returns the busId field of the device's PCI info, or
+// (nil, nil) when NVML reports the query as not supported.
+func (h handle) deviceGetPciInfo() (*string, error) {
+	var info C.nvmlPciInfo_t
+
+	ret := C.nvmlDeviceGetPciInfo(h.dev, &info)
+	if ret == C.NVML_ERROR_NOT_SUPPORTED {
+		return nil, nil
+	}
+	busID := C.GoString(&info.busId[0])
+	return &busID, errorString(ret)
+}
+
+// deviceGetMinorNumber returns the device minor number, or (nil, nil) when
+// NVML reports the query as not supported.
+func (h handle) deviceGetMinorNumber() (*uint, error) {
+	var raw C.uint
+
+	ret := C.nvmlDeviceGetMinorNumber(h.dev, &raw)
+	if ret == C.NVML_ERROR_NOT_SUPPORTED {
+		return nil, nil
+	}
+	minor := uint(raw)
+	return &minor, errorString(ret)
+}
+
+// deviceGetBAR1MemoryInfo returns the bar1Total and bar1Used fields reported
+// by NVML, in that order, or (nil, nil, nil) when the query is not
+// supported.
+func (h handle) deviceGetBAR1MemoryInfo() (*uint64, *uint64, error) {
+	var info C.nvmlBAR1Memory_t
+
+	ret := C.nvmlDeviceGetBAR1MemoryInfo(h.dev, &info)
+	if ret == C.NVML_ERROR_NOT_SUPPORTED {
+		return nil, nil, nil
+	}
+	total, used := uint64(info.bar1Total), uint64(info.bar1Used)
+	return &total, &used, errorString(ret)
+}
+
+// deviceGetPowerManagementLimit returns the power management limit reported
+// by NVML, or (nil, nil) when the query is not supported.
+func (h handle) deviceGetPowerManagementLimit() (*uint, error) {
+	var raw C.uint
+
+	ret := C.nvmlDeviceGetPowerManagementLimit(h.dev, &raw)
+	if ret == C.NVML_ERROR_NOT_SUPPORTED {
+		return nil, nil
+	}
+	limit := uint(raw)
+	return &limit, errorString(ret)
+}
+
+// deviceGetMaxClockInfo returns the maximum SM and memory clocks, in that
+// order. It returns (nil, nil, nil) when the SM query is not supported. The
+// memory clock is queried only if the SM query succeeded; the error
+// reflects the last query performed.
+func (h handle) deviceGetMaxClockInfo() (*uint, *uint, error) {
+	var smClock, memClock C.uint
+
+	ret := C.nvmlDeviceGetMaxClockInfo(h.dev, C.NVML_CLOCK_SM, &smClock)
+	switch ret {
+	case C.NVML_ERROR_NOT_SUPPORTED:
+		return nil, nil, nil
+	case C.NVML_SUCCESS:
+		ret = C.nvmlDeviceGetMaxClockInfo(h.dev, C.NVML_CLOCK_MEM, &memClock)
+	}
+	return uintPtr(smClock), uintPtr(memClock), errorString(ret)
+}
+
+// deviceGetMaxPcieLinkGeneration returns the maximum PCIe link generation
+// reported by NVML, or (nil, nil) when the query is not supported.
+func (h handle) deviceGetMaxPcieLinkGeneration() (*uint, error) {
+	var raw C.uint
+
+	ret := C.nvmlDeviceGetMaxPcieLinkGeneration(h.dev, &raw)
+	if ret == C.NVML_ERROR_NOT_SUPPORTED {
+		return nil, nil
+	}
+	gen := uint(raw)
+	return &gen, errorString(ret)
+}
+
+// deviceGetMaxPcieLinkWidth returns the maximum PCIe link width reported by
+// NVML, or (nil, nil) when the query is not supported.
+func (h handle) deviceGetMaxPcieLinkWidth() (*uint, error) {
+	var raw C.uint
+
+	ret := C.nvmlDeviceGetMaxPcieLinkWidth(h.dev, &raw)
+	if ret == C.NVML_ERROR_NOT_SUPPORTED {
+		return nil, nil
+	}
+	lanes := uint(raw)
+	return &lanes, errorString(ret)
+}
+
+// deviceGetPowerUsage returns the power usage reported by NVML, or
+// (nil, nil) when the query is not supported.
+func (h handle) deviceGetPowerUsage() (*uint, error) {
+	var raw C.uint
+
+	ret := C.nvmlDeviceGetPowerUsage(h.dev, &raw)
+	if ret == C.NVML_ERROR_NOT_SUPPORTED {
+		return nil, nil
+	}
+	usage := uint(raw)
+	return &usage, errorString(ret)
+}
+
+// deviceGetTemperature returns the reading of the NVML_TEMPERATURE_GPU
+// sensor, or (nil, nil) when the query is not supported.
+func (h handle) deviceGetTemperature() (*uint, error) {
+	var raw C.uint
+
+	ret := C.nvmlDeviceGetTemperature(h.dev, C.NVML_TEMPERATURE_GPU, &raw)
+	if ret == C.NVML_ERROR_NOT_SUPPORTED {
+		return nil, nil
+	}
+	temp := uint(raw)
+	return &temp, errorString(ret)
+}
+
+// deviceGetUtilizationRates returns the gpu and memory fields of NVML's
+// utilization rates, in that order, or (nil, nil, nil) when the query is
+// not supported.
+func (h handle) deviceGetUtilizationRates() (*uint, *uint, error) {
+	var rates C.nvmlUtilization_t
+
+	ret := C.nvmlDeviceGetUtilizationRates(h.dev, &rates)
+	if ret == C.NVML_ERROR_NOT_SUPPORTED {
+		return nil, nil, nil
+	}
+	gpu, mem := uint(rates.gpu), uint(rates.memory)
+	return &gpu, &mem, errorString(ret)
+}
+
+// deviceGetEncoderUtilization returns the encoder utilization reported by
+// NVML, or (nil, nil) when the query is not supported. The second value
+// NVML writes (named sampling here) is discarded.
+func (h handle) deviceGetEncoderUtilization() (*uint, error) {
+	var (
+		rawUsage C.uint
+		sampling C.uint
+	)
+
+	ret := C.nvmlDeviceGetEncoderUtilization(h.dev, &rawUsage, &sampling)
+	if ret == C.NVML_ERROR_NOT_SUPPORTED {
+		return nil, nil
+	}
+	usage := uint(rawUsage)
+	return &usage, errorString(ret)
+}
+
+// deviceGetDecoderUtilization returns the decoder utilization reported by
+// NVML, or (nil, nil) when the query is not supported. The second value
+// NVML writes (named sampling here) is discarded.
+func (h handle) deviceGetDecoderUtilization() (*uint, error) {
+	var (
+		rawUsage C.uint
+		sampling C.uint
+	)
+
+	ret := C.nvmlDeviceGetDecoderUtilization(h.dev, &rawUsage, &sampling)
+	if ret == C.NVML_ERROR_NOT_SUPPORTED {
+		return nil, nil
+	}
+	usage := uint(rawUsage)
+	return &usage, errorString(ret)
+}
+
+// deviceGetMemoryInfo returns the device's total memory and its used/free
+// breakdown, each converted from bytes to MiB by integer division.
+//
+// When NVML reports the query as not supported, every result is its zero
+// value (nil total, empty DeviceMemory, nil error). On any other failure
+// only err is set.
+func (h handle) deviceGetMemoryInfo() (totalMem *uint64, devMem DeviceMemory, err error) {
+	const bytesPerMiB = 1024 * 1024
+
+	var raw C.nvmlMemory_t
+	ret := C.nvmlDeviceGetMemoryInfo(h.dev, &raw)
+	if ret == C.NVML_ERROR_NOT_SUPPORTED {
+		return nil, DeviceMemory{}, nil
+	}
+	if ret != C.NVML_SUCCESS {
+		return nil, DeviceMemory{}, errorString(ret)
+	}
+
+	total := uint64(raw.total) / bytesPerMiB
+	used := uint64(raw.used) / bytesPerMiB
+	free := uint64(raw.free) / bytesPerMiB
+
+	return &total, DeviceMemory{Used: &used, Free: &free}, nil
+}
+
+// deviceGetClockInfo returns the current SM and memory clocks, in that
+// order. It returns (nil, nil, nil) when the SM query is not supported. The
+// memory clock is queried only if the SM query succeeded; the error
+// reflects the last query performed.
+func (h handle) deviceGetClockInfo() (*uint, *uint, error) {
+	var smClock, memClock C.uint
+
+	ret := C.nvmlDeviceGetClockInfo(h.dev, C.NVML_CLOCK_SM, &smClock)
+	switch ret {
+	case C.NVML_ERROR_NOT_SUPPORTED:
+		return nil, nil, nil
+	case C.NVML_SUCCESS:
+		ret = C.nvmlDeviceGetClockInfo(h.dev, C.NVML_CLOCK_MEM, &memClock)
+	}
+	return uintPtr(smClock), uintPtr(memClock), errorString(ret)
+}
+
+// deviceGetMemoryErrorCounter returns the volatile uncorrected ECC error
+// counters for the L1 cache, L2 cache and device memory, in that order.
+//
+// It returns all nils when the L1 query is not supported. Each later
+// location is queried only while the previous query succeeded; the error
+// reflects the last query performed.
+func (h handle) deviceGetMemoryErrorCounter() (*uint64, *uint64, *uint64, error) {
+	var l1Count, l2Count, devCount C.ulonglong
+
+	ret := C.nvmlDeviceGetMemoryErrorCounter(h.dev, C.NVML_MEMORY_ERROR_TYPE_UNCORRECTED,
+		C.NVML_VOLATILE_ECC, C.NVML_MEMORY_LOCATION_L1_CACHE, &l1Count)
+	if ret == C.NVML_ERROR_NOT_SUPPORTED {
+		return nil, nil, nil, nil
+	}
+	if ret == C.NVML_SUCCESS {
+		ret = C.nvmlDeviceGetMemoryErrorCounter(h.dev, C.NVML_MEMORY_ERROR_TYPE_UNCORRECTED,
+			C.NVML_VOLATILE_ECC, C.NVML_MEMORY_LOCATION_L2_CACHE, &l2Count)
+		if ret == C.NVML_SUCCESS {
+			ret = C.nvmlDeviceGetMemoryErrorCounter(h.dev, C.NVML_MEMORY_ERROR_TYPE_UNCORRECTED,
+				C.NVML_VOLATILE_ECC, C.NVML_MEMORY_LOCATION_DEVICE_MEMORY, &devCount)
+		}
+	}
+	return uint64Ptr(l1Count), uint64Ptr(l2Count), uint64Ptr(devCount), errorString(ret)
+}
+
+// deviceGetPcieThroughput returns the RX and TX PCIe throughput counters,
+// in that order. It returns (nil, nil, nil) when the RX query is not
+// supported. TX is queried only if the RX query succeeded; the error
+// reflects the last query performed.
+func (h handle) deviceGetPcieThroughput() (*uint, *uint, error) {
+	var rxRaw, txRaw C.uint
+
+	ret := C.nvmlDeviceGetPcieThroughput(h.dev, C.NVML_PCIE_UTIL_RX_BYTES, &rxRaw)
+	switch ret {
+	case C.NVML_ERROR_NOT_SUPPORTED:
+		return nil, nil, nil
+	case C.NVML_SUCCESS:
+		ret = C.nvmlDeviceGetPcieThroughput(h.dev, C.NVML_PCIE_UTIL_TX_BYTES, &txRaw)
+	}
+	return uintPtr(rxRaw), uintPtr(txRaw), errorString(ret)
+}
+
+// deviceGetComputeRunningProcesses returns the PIDs of the compute
+// processes running on the device and, index-aligned with them, the
+// usedGpuMemory value NVML reports for each.
+//
+// At most szProcs processes can be returned. It returns (nil, nil, nil)
+// when the query is not supported and (nil, nil, err) on any other NVML
+// failure.
+func (h handle) deviceGetComputeRunningProcesses() ([]uint, []uint64, error) {
+	var procs [szProcs]C.nvmlProcessInfo_t
+	var count = C.uint(szProcs)
+
+	r := C.nvmlDeviceGetComputeRunningProcesses(h.dev, &count, &procs[0])
+	if r == C.NVML_ERROR_NOT_SUPPORTED {
+		return nil, nil, nil
+	}
+	// Bail out before touching procs: on failure its contents are not
+	// meaningful, and when the array is too small NVML writes the required
+	// size into count, which would index past the end of procs.
+	if r != C.NVML_SUCCESS {
+		return nil, nil, errorString(r)
+	}
+
+	n := int(count)
+	if n > len(procs) {
+		// Defensive clamp; never read beyond the fixed-size array.
+		n = len(procs)
+	}
+	pids := make([]uint, n)
+	mems := make([]uint64, n)
+	for i := 0; i < n; i++ {
+		pids[i] = uint(procs[i].pid)
+		mems[i] = uint64(procs[i].usedGpuMemory)
+	}
+	return pids, mems, nil
+}
+
+// deviceGetGraphicsRunningProcesses returns the PIDs of the graphics
+// processes running on the device and, index-aligned with them, the
+// usedGpuMemory value NVML reports for each.
+//
+// At most szProcs processes can be returned. It returns (nil, nil, nil)
+// when the query is not supported and (nil, nil, err) on any other NVML
+// failure.
+func (h handle) deviceGetGraphicsRunningProcesses() ([]uint, []uint64, error) {
+	var procs [szProcs]C.nvmlProcessInfo_t
+	var count = C.uint(szProcs)
+
+	r := C.nvmlDeviceGetGraphicsRunningProcesses(h.dev, &count, &procs[0])
+	if r == C.NVML_ERROR_NOT_SUPPORTED {
+		return nil, nil, nil
+	}
+	// Bail out before touching procs: on failure its contents are not
+	// meaningful, and when the array is too small NVML writes the required
+	// size into count, which would index past the end of procs.
+	if r != C.NVML_SUCCESS {
+		return nil, nil, errorString(r)
+	}
+
+	n := int(count)
+	if n > len(procs) {
+		// Defensive clamp; never read beyond the fixed-size array.
+		n = len(procs)
+	}
+	pids := make([]uint, n)
+	mems := make([]uint64, n)
+	for i := 0; i < n; i++ {
+		pids[i] = uint(procs[i].pid)
+		mems[i] = uint64(procs[i].usedGpuMemory)
+	}
+	return pids, mems, nil
+}
+
+// deviceGetAllRunningProcesses merges the compute and graphics process
+// lists into one slice sorted by PID.
+//
+// A PID present in both lists appears once with type ComputeAndGraphics and
+// keeps the memory figure taken from the compute list. Memory is converted
+// from bytes to MiB. Any error from the underlying queries or from
+// processName aborts the call.
+func (h handle) deviceGetAllRunningProcesses() ([]ProcessInfo, error) {
+	const bytesPerMiB = 1024 * 1024
+
+	computePIDs, computeMem, err := h.deviceGetComputeRunningProcesses()
+	if err != nil {
+		return nil, err
+	}
+
+	graphicsPIDs, graphicsMem, err := h.deviceGetGraphicsRunningProcesses()
+	if err != nil {
+		return nil, err
+	}
+
+	byPID := make(map[uint]ProcessInfo)
+
+	for idx, pid := range computePIDs {
+		name, err := processName(pid)
+		if err != nil {
+			return nil, err
+		}
+		byPID[pid] = ProcessInfo{
+			PID:        pid,
+			Name:       name,
+			MemoryUsed: computeMem[idx] / bytesPerMiB,
+			Type:       Compute,
+		}
+	}
+
+	for idx, pid := range graphicsPIDs {
+		if known, seen := byPID[pid]; seen {
+			// Already recorded as a compute process: only upgrade its type.
+			known.Type = ComputeAndGraphics
+			byPID[pid] = known
+			continue
+		}
+
+		name, err := processName(pid)
+		if err != nil {
+			return nil, err
+		}
+		byPID[pid] = ProcessInfo{
+			PID:        pid,
+			Name:       name,
+			MemoryUsed: graphicsMem[idx] / bytesPerMiB,
+			Type:       Graphics,
+		}
+	}
+
+	var merged []ProcessInfo
+	for _, info := range byPID {
+		merged = append(merged, info)
+	}
+	sort.Slice(merged, func(a, b int) bool {
+		return merged[a].PID < merged[b].PID
+	})
+
+	return merged, nil
+}
+
+// getClocksThrottleReasons returns the device's current clock throttle
+// reason.
+//
+// It returns ThrottleReasonUnknown with a nil error when the query is not
+// supported, and ThrottleReasonUnknown with the error on any other NVML
+// failure. The reported value is matched against each known reason
+// constant individually; a value that matches none of them (for example
+// several reasons reported at once) also yields ThrottleReasonUnknown
+// rather than falling through to the zero value, ThrottleReasonGpuIdle.
+func (h handle) getClocksThrottleReasons() (reason ThrottleReason, err error) {
+	var clocksThrottleReasons C.ulonglong
+
+	r := C.nvmlDeviceGetCurrentClocksThrottleReasons(h.dev, &clocksThrottleReasons)
+
+	if r == C.NVML_ERROR_NOT_SUPPORTED {
+		return ThrottleReasonUnknown, nil
+	}
+
+	if r != C.NVML_SUCCESS {
+		return ThrottleReasonUnknown, errorString(r)
+	}
+
+	switch clocksThrottleReasons {
+	case C.nvmlClocksThrottleReasonGpuIdle:
+		reason = ThrottleReasonGpuIdle
+	case C.nvmlClocksThrottleReasonApplicationsClocksSetting:
+		reason = ThrottleReasonApplicationsClocksSetting
+	case C.nvmlClocksThrottleReasonSwPowerCap:
+		reason = ThrottleReasonSwPowerCap
+	case C.nvmlClocksThrottleReasonHwSlowdown:
+		reason = ThrottleReasonHwSlowdown
+	case C.nvmlClocksThrottleReasonSyncBoost:
+		reason = ThrottleReasonSyncBoost
+	case C.nvmlClocksThrottleReasonSwThermalSlowdown:
+		reason = ThrottleReasonSwThermalSlowdown
+	case C.nvmlClocksThrottleReasonHwThermalSlowdown:
+		reason = ThrottleReasonHwThermalSlowdown
+	case C.nvmlClocksThrottleReasonHwPowerBrakeSlowdown:
+		reason = ThrottleReasonHwPowerBrakeSlowdown
+	case C.nvmlClocksThrottleReasonDisplayClockSetting:
+		reason = ThrottleReasonDisplayClockSetting
+	case C.nvmlClocksThrottleReasonNone:
+		reason = ThrottleReasonNone
+	default:
+		// Unrecognised value: do not misreport it as "Gpu Idle".
+		reason = ThrottleReasonUnknown
+	}
+	return reason, nil
+}
+
+// getPerformanceState returns the device's performance state. It returns
+// PerfStateUnknown with a nil error when the query is not supported, and
+// PerfStateUnknown with the error on any other NVML failure.
+func (h handle) getPerformanceState() (PerfState, error) {
+	var raw C.nvmlPstates_t
+
+	switch ret := C.nvmlDeviceGetPerformanceState(h.dev, &raw); ret {
+	case C.NVML_SUCCESS:
+		return PerfState(raw), nil
+	case C.NVML_ERROR_NOT_SUPPORTED:
+		return PerfStateUnknown, nil
+	default:
+		return PerfStateUnknown, errorString(ret)
+	}
+}
+
+// processName reads /proc/<pid>/comm and returns its contents with one
+// trailing newline removed. A missing file yields ("", nil); any other read
+// error is returned unchanged.
+func processName(pid uint) (string, error) {
+	path := "/proc/" + strconv.FormatUint(uint64(pid), 10) + "/comm"
+	data, err := ioutil.ReadFile(path)
+
+	switch {
+	case err == nil:
+		return strings.TrimSuffix(string(data), "\n"), nil
+	case os.IsNotExist(err):
+		// TOCTOU: process terminated
+		return "", nil
+	default:
+		return "", err
+	}
+}
+
+// getAccountingInfo returns the device's accounting mode and accounting
+// buffer size. If either query is not supported the result is the zero
+// Accounting with a nil error; any other NVML failure returns the zero
+// Accounting and the error.
+func (h handle) getAccountingInfo() (accountingInfo Accounting, err error) {
+	var (
+		rawMode C.nvmlEnableState_t
+		rawSize C.uint
+	)
+
+	ret := C.nvmlDeviceGetAccountingMode(h.dev, &rawMode)
+	if ret == C.NVML_ERROR_NOT_SUPPORTED {
+		return Accounting{}, nil
+	}
+	if ret != C.NVML_SUCCESS {
+		return Accounting{}, errorString(ret)
+	}
+
+	ret = C.nvmlDeviceGetAccountingBufferSize(h.dev, &rawSize)
+	if ret == C.NVML_ERROR_NOT_SUPPORTED {
+		return Accounting{}, nil
+	}
+	if ret != C.NVML_SUCCESS {
+		return Accounting{}, errorString(ret)
+	}
+
+	size := uint(rawSize)
+	return Accounting{Mode: ModeState(rawMode), BufferSize: &size}, nil
+}
+
+// getDisplayInfo returns the device's display mode and display-active
+// state.
+//
+// Display.Active is filled from nvmlDeviceGetDisplayActive and Display.Mode
+// from nvmlDeviceGetDisplayMode. If either query is not supported the
+// result is the zero Display with a nil error; any other NVML failure
+// returns the zero Display and the error.
+func (h handle) getDisplayInfo() (display Display, err error) {
+	var active, mode C.nvmlEnableState_t
+
+	r := C.nvmlDeviceGetDisplayActive(h.dev, &active)
+	if r == C.NVML_ERROR_NOT_SUPPORTED {
+		return
+	}
+
+	if r != C.NVML_SUCCESS {
+		return display, errorString(r)
+	}
+
+	r = C.nvmlDeviceGetDisplayMode(h.dev, &mode)
+	if r == C.NVML_ERROR_NOT_SUPPORTED {
+		return
+	}
+	if r != C.NVML_SUCCESS {
+		return display, errorString(r)
+	}
+	// Each field takes the value from the matching query; previously the
+	// two results were stored in each other's field.
+	display = Display{
+		Mode:   ModeState(mode),
+		Active: ModeState(active),
+	}
+	return
+}
+
+// getPeristenceMode returns the device's persistence mode. When the query
+// is not supported it returns the zero ModeState and a nil error.
+func (h handle) getPeristenceMode() (state ModeState, err error) {
+	var raw C.nvmlEnableState_t
+
+	ret := C.nvmlDeviceGetPersistenceMode(h.dev, &raw)
+	if ret != C.NVML_ERROR_NOT_SUPPORTED {
+		state, err = ModeState(raw), errorString(ret)
+	}
+	return state, err
+}
diff --git a/vendor/github.com/NVIDIA/gpu-monitoring-tools/bindings/go/nvml/nvml.go b/vendor/github.com/NVIDIA/gpu-monitoring-tools/bindings/go/nvml/nvml.go
new file mode 100644
index 000000000..f6ec9e8fa
--- /dev/null
+++ b/vendor/github.com/NVIDIA/gpu-monitoring-tools/bindings/go/nvml/nvml.go
@@ -0,0 +1,533 @@
+// Copyright (c) 2015-2018, NVIDIA CORPORATION. All rights reserved.
+
+package nvml
+
+// #include "nvml_dl.h"
+import "C"
+
+import (
+ "bytes"
+ "errors"
+ "fmt"
+ "io/ioutil"
+ "strconv"
+ "strings"
+)
+
+// Sentinel errors exposed by this package.
+var (
+ // ErrCPUAffinity prefixes the error numaNode returns when the NUMA node cannot be parsed.
+ ErrCPUAffinity = errors.New("failed to retrieve CPU affinity")
+ // ErrUnsupportedP2PLink reports an unsupported P2P link type.
+ ErrUnsupportedP2PLink = errors.New("unsupported P2P link type")
+ // ErrUnsupportedGPU reports an unsupported GPU device.
+ ErrUnsupportedGPU = errors.New("unsupported GPU device")
+)
+
+// ModeState is the Go form of NVML's nvmlEnableState_t. Values are produced
+// by converting the raw NVML value directly, so the constants below must
+// carry the same numeric values as the NVML enum.
+type ModeState uint
+
+const (
+	// Disabled matches NVML_FEATURE_DISABLED (0).
+	Disabled ModeState = iota
+	// Enabled matches NVML_FEATURE_ENABLED (1).
+	Enabled
+)
+
+// String returns "Enabled" or "Disabled", and "N/A" for any other value.
+func (m ModeState) String() string {
+	switch m {
+	case Enabled:
+		return "Enabled"
+	case Disabled:
+		return "Disabled"
+	}
+	return "N/A"
+}
+
+// Display holds the display-related enable states of a device.
+type Display struct {
+ Mode ModeState
+ Active ModeState
+}
+
+// Accounting holds the accounting mode and accounting buffer size of a device.
+type Accounting struct {
+ Mode ModeState
+ BufferSize *uint // nil when not populated
+}
+
+// DeviceMode groups the display, persistence and accounting state of a device.
+type DeviceMode struct {
+ DisplayInfo Display
+ Persistence ModeState
+ AccountingInfo Accounting
+}
+
+// ThrottleReason identifies why a device's clocks are being throttled.
+type ThrottleReason uint
+
+const (
+	ThrottleReasonGpuIdle ThrottleReason = iota
+	ThrottleReasonApplicationsClocksSetting
+	ThrottleReasonSwPowerCap
+	ThrottleReasonHwSlowdown
+	ThrottleReasonSyncBoost
+	ThrottleReasonSwThermalSlowdown
+	ThrottleReasonHwThermalSlowdown
+	ThrottleReasonHwPowerBrakeSlowdown
+	ThrottleReasonDisplayClockSetting
+	ThrottleReasonNone
+	ThrottleReasonUnknown
+)
+
+// String returns the human-readable label of the reason, or "N/A" for
+// ThrottleReasonUnknown and any value outside the declared range.
+func (r ThrottleReason) String() string {
+	labels := [...]string{
+		ThrottleReasonGpuIdle:                   "Gpu Idle",
+		ThrottleReasonApplicationsClocksSetting: "Applications Clocks Setting",
+		ThrottleReasonSwPowerCap:                "SW Power Cap",
+		ThrottleReasonHwSlowdown:                "HW Slowdown",
+		ThrottleReasonSyncBoost:                 "Sync Boost",
+		ThrottleReasonSwThermalSlowdown:         "SW Thermal Slowdown",
+		ThrottleReasonHwThermalSlowdown:         "HW Thermal Slowdown",
+		ThrottleReasonHwPowerBrakeSlowdown:      "HW Power Brake Slowdown",
+		ThrottleReasonDisplayClockSetting:       "Display Clock Setting",
+		ThrottleReasonNone:                      "No clocks throttling",
+	}
+	if r < ThrottleReason(len(labels)) {
+		return labels[r]
+	}
+	return "N/A"
+}
+
+// PerfState is a device performance state number.
+type PerfState uint
+
+const (
+	PerfStateMax     = 0
+	PerfStateMin     = 15
+	PerfStateUnknown = 32
+)
+
+// String formats states in the range PerfStateMax..PerfStateMin as "P<n>"
+// and everything else as "Unknown".
+func (p PerfState) String() string {
+	// p is unsigned, so only the upper bound can actually be violated.
+	if p < PerfStateMax || p > PerfStateMin {
+		return "Unknown"
+	}
+	return fmt.Sprintf("P%d", p)
+}
+
+// ProcessType classifies a process by the kind of GPU context it holds.
+type ProcessType uint
+
+const (
+	Compute ProcessType = iota
+	Graphics
+	ComputeAndGraphics
+)
+
+// String returns "C" for Compute, "G" for Graphics and "C+G" for every
+// other value.
+func (t ProcessType) String() string {
+	switch t {
+	case Compute:
+		return "C"
+	case Graphics:
+		return "G"
+	default:
+		return "C+G"
+	}
+}
+
+// P2PLinkType describes how two devices are connected to each other.
+type P2PLinkType uint
+
+const (
+	P2PLinkUnknown P2PLinkType = iota
+	P2PLinkCrossCPU
+	P2PLinkSameCPU
+	P2PLinkHostBridge
+	P2PLinkMultiSwitch
+	P2PLinkSingleSwitch
+	P2PLinkSameBoard
+)
+
+// P2PLink pairs a bus id with a link type.
+type P2PLink struct {
+	BusID string
+	Link  P2PLinkType
+}
+
+// String returns the human-readable label of the link type, or "N/A" for
+// P2PLinkUnknown and any value outside the declared range.
+func (t P2PLinkType) String() string {
+	labels := [...]string{
+		P2PLinkUnknown:      "N/A",
+		P2PLinkCrossCPU:     "Cross CPU socket",
+		P2PLinkSameCPU:      "Same CPU socket",
+		P2PLinkHostBridge:   "Host PCI bridge",
+		P2PLinkMultiSwitch:  "Multiple PCI switches",
+		P2PLinkSingleSwitch: "Single PCI switch",
+		P2PLinkSameBoard:    "Same board",
+	}
+	if t >= P2PLinkType(len(labels)) {
+		return "N/A"
+	}
+	return labels[t]
+}
+
+// ClockInfo holds a pair of clock values; a nil field means no value is available.
+type ClockInfo struct {
+ Cores *uint
+ Memory *uint
+}
+
+// PCIInfo holds PCI attributes of a device; nil pointer fields mean no value is available.
+type PCIInfo struct {
+ BusID string
+ BAR1 *uint64
+ Bandwidth *uint // NOTE(review): presumably the pciBandwidth result (MB/s) - confirm in NewDevice
+}
+
+// Device describes a single GPU. It embeds handle, so the handle's query
+// methods can be called on a Device directly. Pointer fields are nil when no
+// value is available.
+type Device struct {
+ handle
+
+ UUID string
+ Path string
+ Model *string
+ Power *uint
+ Memory *uint64
+ CPUAffinity *uint
+ PCI PCIInfo
+ Clocks ClockInfo
+ Topology []P2PLink
+}
+
+// UtilizationInfo holds utilization figures; a nil field means no value is available.
+type UtilizationInfo struct {
+ GPU *uint
+ Memory *uint
+ Encoder *uint
+ Decoder *uint
+}
+
+// PCIThroughputInfo holds PCIe receive and transmit throughput values.
+type PCIThroughputInfo struct {
+ RX *uint
+ TX *uint
+}
+
+// PCIStatusInfo holds the BAR1 usage and PCIe throughput of a device.
+type PCIStatusInfo struct {
+ BAR1Used *uint64
+ Throughput PCIThroughputInfo
+}
+
+// ECCErrorsInfo holds ECC error counters per memory location.
+type ECCErrorsInfo struct {
+ L1Cache *uint64
+ L2Cache *uint64
+ Device *uint64
+}
+
+// DeviceMemory holds used and free device memory. deviceGetMemoryInfo fills
+// both fields in MiB.
+type DeviceMemory struct {
+ Used *uint64
+ Free *uint64
+}
+
+// MemoryInfo groups memory usage and ECC error counters.
+type MemoryInfo struct {
+ Global DeviceMemory
+ ECCErrors ECCErrorsInfo
+}
+
+// ProcessInfo describes one process running on a device, as assembled by
+// deviceGetAllRunningProcesses.
+type ProcessInfo struct {
+ PID uint
+ Name string // contents of /proc/<pid>/comm; empty if the process has already exited
+ MemoryUsed uint64 // MiB
+ Type ProcessType
+}
+
+// DeviceStatus groups the dynamic readings of a device. Pointer fields are
+// nil when no value is available.
+type DeviceStatus struct {
+ Power *uint
+ Temperature *uint
+ Utilization UtilizationInfo
+ Memory MemoryInfo
+ Clocks ClockInfo
+ PCI PCIStatusInfo
+ Processes []ProcessInfo
+ Throttle ThrottleReason
+ Performance PerfState
+}
+
+// assert panics with err when err is non-nil and does nothing otherwise.
+// Callers in this file pair it with a deferred recover that turns the panic
+// back into a returned error.
+func assert(err error) {
+	if err == nil {
+		return
+	}
+	panic(err)
+}
+
+// Init delegates to init_ and returns its error.
+// NOTE(review): presumably this loads and initializes the NVML library and
+// must precede every other call in this package — confirm in the bindings.
+func Init() error {
+ return init_()
+}
+
+// Shutdown delegates to shutdown and returns its error.
+// NOTE(review): presumably the counterpart of Init that releases the NVML
+// library — confirm in the bindings.
+func Shutdown() error {
+ return shutdown()
+}
+
+// GetDeviceCount returns the device count reported by deviceGetCount, or the
+// error that call produced.
+func GetDeviceCount() (uint, error) {
+ return deviceGetCount()
+}
+
+// GetDriverVersion returns the driver version string reported by
+// systemGetDriverVersion, or the error that call produced.
+func GetDriverVersion() (string, error) {
+ return systemGetDriverVersion()
+}
+
+func numaNode(busid string) (uint, error) {
+ // discard leading zeros of busid
+ b, err := ioutil.ReadFile(fmt.Sprintf("/sys/bus/pci/devices/%s/numa_node", strings.ToLower(busid[4:])))
+ if err != nil {
+ // XXX report node 0 if NUMA support isn't enabled
+ return 0, nil
+ }
+ node, err := strconv.ParseInt(string(bytes.TrimSpace(b)), 10, 8)
+ if err != nil {
+ return 0, fmt.Errorf("%v: %v", ErrCPUAffinity, err)
+ }
+ if node < 0 {
+ node = 0 // XXX report node 0 instead of NUMA_NO_NODE
+ }
+ return uint(node), nil
+}
+
+func pciBandwidth(gen, width *uint) *uint {
+ m := map[uint]uint{
+ 1: 250, // MB/s
+ 2: 500,
+ 3: 985,
+ 4: 1969,
+ }
+ if gen == nil || width == nil {
+ return nil
+ }
+ bw := m[*gen] * *width
+ return &bw
+}
+
+// NewDevice queries the driver for the static properties of the device at
+// index idx and returns them as a Device.
+//
+// The first failing query aborts construction and its error is returned with
+// a nil device. ErrUnsupportedGPU is returned when the driver reports no
+// minor number, bus id or UUID. Power is converted to W and BAR1 to MiB.
+func NewDevice(idx uint) (device *Device, err error) {
+	// Turn a panic raised while querying the device into the returned error.
+	defer func() {
+		if r := recover(); r != nil {
+			err = r.(error)
+		}
+	}()
+
+	h, err := deviceGetHandleByIndex(idx)
+	if err != nil {
+		return nil, err
+	}
+	model, err := h.deviceGetName()
+	if err != nil {
+		return nil, err
+	}
+	uuid, err := h.deviceGetUUID()
+	if err != nil {
+		return nil, err
+	}
+	minor, err := h.deviceGetMinorNumber()
+	if err != nil {
+		return nil, err
+	}
+	power, err := h.deviceGetPowerManagementLimit()
+	if err != nil {
+		return nil, err
+	}
+	totalMem, _, err := h.deviceGetMemoryInfo()
+	if err != nil {
+		return nil, err
+	}
+	busid, err := h.deviceGetPciInfo()
+	if err != nil {
+		return nil, err
+	}
+	bar1, _, err := h.deviceGetBAR1MemoryInfo()
+	if err != nil {
+		return nil, err
+	}
+	pcig, err := h.deviceGetMaxPcieLinkGeneration()
+	if err != nil {
+		return nil, err
+	}
+	pciw, err := h.deviceGetMaxPcieLinkWidth()
+	if err != nil {
+		return nil, err
+	}
+	ccore, cmem, err := h.deviceGetMaxClockInfo()
+	if err != nil {
+		return nil, err
+	}
+
+	// Without these three values the device cannot be identified.
+	if minor == nil || busid == nil || uuid == nil {
+		return nil, ErrUnsupportedGPU
+	}
+	node, err := numaNode(*busid)
+	if err != nil {
+		return nil, err
+	}
+
+	// Unit conversions happen in place on the values returned by the driver.
+	if power != nil {
+		*power /= 1000 // W
+	}
+	if bar1 != nil {
+		*bar1 /= 1024 * 1024 // MiB
+	}
+
+	device = &Device{
+		handle:      h,
+		UUID:        *uuid,
+		Path:        fmt.Sprintf("/dev/nvidia%d", *minor),
+		Model:       model,
+		Power:       power,
+		Memory:      totalMem,
+		CPUAffinity: &node,
+		PCI: PCIInfo{
+			BusID:     *busid,
+			BAR1:      bar1,
+			Bandwidth: pciBandwidth(pcig, pciw), // MB/s
+		},
+		Clocks: ClockInfo{
+			Cores:  ccore, // MHz
+			Memory: cmem,  // MHz
+		},
+	}
+	return device, nil
+}
+
+// NewDeviceLite builds a Device for index idx that carries only the handle,
+// UUID, device path and PCI bus id; every other field is left at its zero
+// value.
+//
+// The first failing query is returned as the error with a nil device.
+// ErrUnsupportedGPU is returned when the driver reports no minor number, bus
+// id or UUID.
+func NewDeviceLite(idx uint) (device *Device, err error) {
+	// Turn a panic raised while querying the device into the returned error.
+	defer func() {
+		if r := recover(); r != nil {
+			err = r.(error)
+		}
+	}()
+
+	h, err := deviceGetHandleByIndex(idx)
+	if err != nil {
+		return nil, err
+	}
+	uuid, err := h.deviceGetUUID()
+	if err != nil {
+		return nil, err
+	}
+	minor, err := h.deviceGetMinorNumber()
+	if err != nil {
+		return nil, err
+	}
+	busid, err := h.deviceGetPciInfo()
+	if err != nil {
+		return nil, err
+	}
+
+	if minor == nil || busid == nil || uuid == nil {
+		return nil, ErrUnsupportedGPU
+	}
+
+	device = &Device{
+		handle: h,
+		UUID:   *uuid,
+		Path:   fmt.Sprintf("/dev/nvidia%d", *minor),
+		PCI:    PCIInfo{BusID: *busid},
+	}
+	return device, nil
+}
+
+// Status collects the current runtime readings of the device: power,
+// temperature, utilization, memory, clocks, PCI usage, throttle reason,
+// performance state and running processes.
+//
+// The first failing query is returned as the error with a nil status. Power
+// is converted to W, BAR1 usage to MiB and PCIe throughput to MB/s.
+func (d *Device) Status() (status *DeviceStatus, err error) {
+	// Turn a panic raised while querying the device into the returned error.
+	defer func() {
+		if r := recover(); r != nil {
+			err = r.(error)
+		}
+	}()
+
+	power, err := d.deviceGetPowerUsage()
+	if err != nil {
+		return nil, err
+	}
+	temp, err := d.deviceGetTemperature()
+	if err != nil {
+		return nil, err
+	}
+	ugpu, umem, err := d.deviceGetUtilizationRates()
+	if err != nil {
+		return nil, err
+	}
+	uenc, err := d.deviceGetEncoderUtilization()
+	if err != nil {
+		return nil, err
+	}
+	udec, err := d.deviceGetDecoderUtilization()
+	if err != nil {
+		return nil, err
+	}
+	_, devMem, err := d.deviceGetMemoryInfo()
+	if err != nil {
+		return nil, err
+	}
+	ccore, cmem, err := d.deviceGetClockInfo()
+	if err != nil {
+		return nil, err
+	}
+	_, bar1, err := d.deviceGetBAR1MemoryInfo()
+	if err != nil {
+		return nil, err
+	}
+	el1, el2, emem, err := d.deviceGetMemoryErrorCounter()
+	if err != nil {
+		return nil, err
+	}
+	pcirx, pcitx, err := d.deviceGetPcieThroughput()
+	if err != nil {
+		return nil, err
+	}
+	throttle, err := d.getClocksThrottleReasons()
+	if err != nil {
+		return nil, err
+	}
+	perfState, err := d.getPerformanceState()
+	if err != nil {
+		return nil, err
+	}
+	processInfo, err := d.deviceGetAllRunningProcesses()
+	if err != nil {
+		return nil, err
+	}
+
+	// Unit conversions happen in place on the values returned by the driver.
+	if power != nil {
+		*power /= 1000 // W
+	}
+	if bar1 != nil {
+		*bar1 /= 1024 * 1024 // MiB
+	}
+	if pcirx != nil {
+		*pcirx /= 1000 // MB/s
+	}
+	if pcitx != nil {
+		*pcitx /= 1000 // MB/s
+	}
+
+	status = &DeviceStatus{
+		Power:       power,
+		Temperature: temp, // °C
+		Utilization: UtilizationInfo{
+			GPU:     ugpu, // %
+			Memory:  umem, // %
+			Encoder: uenc, // %
+			Decoder: udec, // %
+		},
+		Memory: MemoryInfo{
+			Global: devMem,
+			ECCErrors: ECCErrorsInfo{
+				L1Cache: el1,
+				L2Cache: el2,
+				Device:  emem,
+			},
+		},
+		Clocks: ClockInfo{
+			Cores:  ccore, // MHz
+			Memory: cmem,  // MHz
+		},
+		PCI: PCIStatusInfo{
+			BAR1Used: bar1,
+			Throughput: PCIThroughputInfo{
+				RX: pcirx,
+				TX: pcitx,
+			},
+		},
+		Throttle:    throttle,
+		Performance: perfState,
+		Processes:   processInfo,
+	}
+	return status, nil
+}
+
+// GetP2PLink reports how dev1 and dev2 are connected, based on the topology
+// level of their common ancestor.
+//
+// It returns P2PLinkUnknown together with the query error when the lookup
+// fails, P2PLinkUnknown with a nil error when no level is reported, and
+// P2PLinkUnknown with ErrUnsupportedP2PLink when the level is not recognised.
+func GetP2PLink(dev1, dev2 *Device) (link P2PLinkType, err error) {
+	level, err := deviceGetTopologyCommonAncestor(dev1.handle, dev2.handle)
+	if err != nil {
+		return P2PLinkUnknown, err
+	}
+	if level == nil {
+		return P2PLinkUnknown, nil
+	}
+
+	switch *level {
+	case C.NVML_TOPOLOGY_SYSTEM:
+		return P2PLinkCrossCPU, nil
+	case C.NVML_TOPOLOGY_CPU:
+		return P2PLinkSameCPU, nil
+	case C.NVML_TOPOLOGY_HOSTBRIDGE:
+		return P2PLinkHostBridge, nil
+	case C.NVML_TOPOLOGY_MULTIPLE:
+		return P2PLinkMultiSwitch, nil
+	case C.NVML_TOPOLOGY_SINGLE:
+		return P2PLinkSingleSwitch, nil
+	case C.NVML_TOPOLOGY_INTERNAL:
+		return P2PLinkSameBoard, nil
+	}
+	return P2PLinkUnknown, ErrUnsupportedP2PLink
+}
+
+// GetComputeRunningProcesses forwards to the handle's
+// deviceGetComputeRunningProcesses and returns its two slices and error
+// unchanged.
+// NOTE(review): the slices are presumably parallel lists of PIDs and used
+// memory per process — confirm in the helper.
+func (d *Device) GetComputeRunningProcesses() ([]uint, []uint64, error) {
+ return d.handle.deviceGetComputeRunningProcesses()
+}
+
+// GetGraphicsRunningProcesses forwards to the handle's
+// deviceGetGraphicsRunningProcesses and returns its two slices and error
+// unchanged.
+// NOTE(review): the slices are presumably parallel lists of PIDs and used
+// memory per process — confirm in the helper.
+func (d *Device) GetGraphicsRunningProcesses() ([]uint, []uint64, error) {
+ return d.handle.deviceGetGraphicsRunningProcesses()
+}
+
+// GetAllRunningProcesses forwards to the handle's
+// deviceGetAllRunningProcesses and returns its process list and error
+// unchanged. Device.Status uses the same helper to fill
+// DeviceStatus.Processes.
+func (d *Device) GetAllRunningProcesses() ([]ProcessInfo, error) {
+ return d.handle.deviceGetAllRunningProcesses()
+}
+
+// GetDeviceMode gathers the display info, persistence mode and accounting
+// info of the device into a DeviceMode. The first failing query is returned
+// as the error with a nil mode.
+func (d *Device) GetDeviceMode() (mode *DeviceMode, err error) {
+	// Turn a panic raised while querying the device into the returned error.
+	defer func() {
+		if r := recover(); r != nil {
+			err = r.(error)
+		}
+	}()
+
+	display, err := d.getDisplayInfo()
+	if err != nil {
+		return nil, err
+	}
+	persistence, err := d.getPeristenceMode()
+	if err != nil {
+		return nil, err
+	}
+	accounting, err := d.getAccountingInfo()
+	if err != nil {
+		return nil, err
+	}
+
+	mode = &DeviceMode{
+		DisplayInfo:    display,
+		Persistence:    persistence,
+		AccountingInfo: accounting,
+	}
+	return mode, nil
+}
diff --git a/vendor/github.com/NVIDIA/gpu-monitoring-tools/bindings/go/nvml/nvml.h b/vendor/github.com/NVIDIA/gpu-monitoring-tools/bindings/go/nvml/nvml.h
new file mode 100644
index 000000000..60185dac2
--- /dev/null
+++ b/vendor/github.com/NVIDIA/gpu-monitoring-tools/bindings/go/nvml/nvml.h
@@ -0,0 +1,5871 @@
+/*
+ * Copyright 1993-2017 NVIDIA Corporation. All rights reserved.
+ *
+ * NOTICE TO USER:
+ *
+ * This source code is subject to NVIDIA ownership rights under U.S. and
+ * international Copyright laws. Users and possessors of this source code
+ * are hereby granted a nonexclusive, royalty-free license to use this code
+ * in individual and commercial software.
+ *
+ * NVIDIA MAKES NO REPRESENTATION ABOUT THE SUITABILITY OF THIS SOURCE
+ * CODE FOR ANY PURPOSE. IT IS PROVIDED "AS IS" WITHOUT EXPRESS OR
+ * IMPLIED WARRANTY OF ANY KIND. NVIDIA DISCLAIMS ALL WARRANTIES WITH
+ * REGARD TO THIS SOURCE CODE, INCLUDING ALL IMPLIED WARRANTIES OF
+ * MERCHANTABILITY, NONINFRINGEMENT, AND FITNESS FOR A PARTICULAR PURPOSE.
+ * IN NO EVENT SHALL NVIDIA BE LIABLE FOR ANY SPECIAL, INDIRECT, INCIDENTAL,
+ * OR CONSEQUENTIAL DAMAGES, OR ANY DAMAGES WHATSOEVER RESULTING FROM LOSS
+ * OF USE, DATA OR PROFITS, WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE
+ * OR OTHER TORTIOUS ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE
+ * OR PERFORMANCE OF THIS SOURCE CODE.
+ *
+ * U.S. Government End Users. This source code is a "commercial item" as
+ * that term is defined at 48 C.F.R. 2.101 (OCT 1995), consisting of
+ * "commercial computer software" and "commercial computer software
+ * documentation" as such terms are used in 48 C.F.R. 12.212 (SEPT 1995)
+ * and is provided to the U.S. Government only as a commercial end item.
+ * Consistent with 48 C.F.R.12.212 and 48 C.F.R. 227.7202-1 through
+ * 227.7202-4 (JUNE 1995), all U.S. Government End Users acquire the
+ * source code with only those rights set forth herein.
+ *
+ * Any use of this source code in individual and commercial software must
+ * include, in the user documentation and internal comments to the code,
+ * the above Disclaimer and U.S. Government End Users Notice.
+ */
+
+/*
+NVML API Reference
+
+The NVIDIA Management Library (NVML) is a C-based programmatic interface for monitoring and
+managing various states within NVIDIA Tesla &tm; GPUs. It is intended to be a platform for building
+3rd party applications, and is also the underlying library for the NVIDIA-supported nvidia-smi
+tool. NVML is thread-safe so it is safe to make simultaneous NVML calls from multiple threads.
+
+API Documentation
+
+Supported platforms:
+- Windows: Windows Server 2008 R2 64bit, Windows Server 2012 R2 64bit, Windows 7 64bit, Windows 8 64bit, Windows 10 64bit
+- Linux: 32-bit and 64-bit
+- Hypervisors: Windows Server 2008R2/2012 Hyper-V 64bit, Citrix XenServer 6.2 SP1+, VMware ESX 5.1/5.5
+
+Supported products:
+- Full Support
+ - All Tesla products, starting with the Fermi architecture
+ - All Quadro products, starting with the Fermi architecture
+ - All GRID products, starting with the Kepler architecture
+ - Selected GeForce Titan products
+- Limited Support
+ - All Geforce products, starting with the Fermi architecture
+
+The NVML library can be found at \%ProgramW6432\%\\"NVIDIA Corporation"\\NVSMI\\ on Windows. It is
+not added to the system path by default. To dynamically link to NVML, add this path to the PATH
+environmental variable. To dynamically load NVML, call LoadLibrary with this path.
+
+On Linux the NVML library will be found on the standard library path. For 64 bit Linux, both the 32 bit
+and 64 bit NVML libraries will be installed.
+
+Online documentation for this library is available at http://docs.nvidia.com/deploy/nvml-api/index.html
+*/
+
+#ifndef __nvml_nvml_h__
+#define __nvml_nvml_h__
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+/*
+ * On Windows, set up methods for DLL export
+ * define NVML_STATIC_IMPORT when using nvml_loader library
+ */
+#if defined _WINDOWS
+ #if !defined NVML_STATIC_IMPORT
+ #if defined NVML_LIB_EXPORT
+ #define DECLDIR __declspec(dllexport)
+ #else
+ #define DECLDIR __declspec(dllimport)
+ #endif
+ #else
+ #define DECLDIR
+ #endif
+#else
+ #define DECLDIR
+#endif
+
+/**
+ * NVML API versioning support
+ */
+#define NVML_API_VERSION 9
+#define NVML_API_VERSION_STR "9"
+#define nvmlInit nvmlInit_v2
+#define nvmlDeviceGetPciInfo nvmlDeviceGetPciInfo_v3
+#define nvmlDeviceGetCount nvmlDeviceGetCount_v2
+#define nvmlDeviceGetHandleByIndex nvmlDeviceGetHandleByIndex_v2
+#define nvmlDeviceGetHandleByPciBusId nvmlDeviceGetHandleByPciBusId_v2
+#define nvmlDeviceGetNvLinkRemotePciInfo nvmlDeviceGetNvLinkRemotePciInfo_v2
+#define nvmlDeviceRemoveGpu nvmlDeviceRemoveGpu_v2
+
+/***************************************************************************************************/
+/** @defgroup nvmlDeviceStructs Device Structs
+ * @{
+ */
+/***************************************************************************************************/
+
+/**
+ * Special constant that some fields take when they are not available.
+ * Used when only part of the struct is not available.
+ *
+ * Each structure explicitly states when to check for this value.
+ */
+#define NVML_VALUE_NOT_AVAILABLE (-1)
+
+typedef struct nvmlDevice_st* nvmlDevice_t;
+
+/**
+ * Buffer size guaranteed to be large enough for pci bus id
+ */
+#define NVML_DEVICE_PCI_BUS_ID_BUFFER_SIZE 32
+
+/**
+ * Buffer size guaranteed to be large enough for pci bus id for ::busIdLegacy
+ */
+#define NVML_DEVICE_PCI_BUS_ID_BUFFER_V2_SIZE 16
+
+/**
+ * PCI information about a GPU device.
+ */
+typedef struct nvmlPciInfo_st
+{
+ char busIdLegacy[NVML_DEVICE_PCI_BUS_ID_BUFFER_V2_SIZE]; //!< The legacy tuple domain:bus:device.function PCI identifier (& NULL terminator)
+ unsigned int domain; //!< The PCI domain on which the device's bus resides, 0 to 0xffffffff
+ unsigned int bus; //!< The bus on which the device resides, 0 to 0xff
+ unsigned int device; //!< The device's id on the bus, 0 to 31
+ unsigned int pciDeviceId; //!< The combined 16-bit device id and 16-bit vendor id
+
+ // Added in NVML 2.285 API
+ unsigned int pciSubSystemId; //!< The 32-bit Sub System Device ID
+
+ char busId[NVML_DEVICE_PCI_BUS_ID_BUFFER_SIZE]; //!< The tuple domain:bus:device.function PCI identifier (& NULL terminator)
+} nvmlPciInfo_t;
+
+/**
+ * Detailed ECC error counts for a device.
+ *
+ * @deprecated Different GPU families can have different memory error counters
+ * See \ref nvmlDeviceGetMemoryErrorCounter
+ */
+typedef struct nvmlEccErrorCounts_st
+{
+ unsigned long long l1Cache; //!< L1 cache errors
+ unsigned long long l2Cache; //!< L2 cache errors
+ unsigned long long deviceMemory; //!< Device memory errors
+ unsigned long long registerFile; //!< Register file errors
+} nvmlEccErrorCounts_t;
+
+/**
+ * Utilization information for a device.
+ * Each sample period may be between 1 second and 1/6 second, depending on the product being queried.
+ */
+typedef struct nvmlUtilization_st
+{
+ unsigned int gpu; //!< Percent of time over the past sample period during which one or more kernels was executing on the GPU
+ unsigned int memory; //!< Percent of time over the past sample period during which global (device) memory was being read or written
+} nvmlUtilization_t;
+
+/**
+ * Memory allocation information for a device.
+ */
+typedef struct nvmlMemory_st
+{
+ unsigned long long total; //!< Total installed FB memory (in bytes)
+ unsigned long long free; //!< Unallocated FB memory (in bytes)
+ unsigned long long used; //!< Allocated FB memory (in bytes). Note that the driver/GPU always sets aside a small amount of memory for bookkeeping
+} nvmlMemory_t;
+
+/**
+ * BAR1 Memory allocation Information for a device
+ */
+typedef struct nvmlBAR1Memory_st
+{
+ unsigned long long bar1Total; //!< Total BAR1 Memory (in bytes)
+ unsigned long long bar1Free; //!< Unallocated BAR1 Memory (in bytes)
+ unsigned long long bar1Used; //!< Allocated Used Memory (in bytes)
+}nvmlBAR1Memory_t;
+
+/**
+ * Information about running compute processes on the GPU
+ */
+typedef struct nvmlProcessInfo_st
+{
+ unsigned int pid; //!< Process ID
+ unsigned long long usedGpuMemory; //!< Amount of used GPU memory in bytes.
+ //! Under WDDM, \ref NVML_VALUE_NOT_AVAILABLE is always reported
+ //! because Windows KMD manages all the memory and not the NVIDIA driver
+} nvmlProcessInfo_t;
+
+/**
+ * Enum to represent type of bridge chip
+ */
+typedef enum nvmlBridgeChipType_enum
+{
+ NVML_BRIDGE_CHIP_PLX = 0,
+ NVML_BRIDGE_CHIP_BRO4 = 1
+}nvmlBridgeChipType_t;
+
+/**
+ * Maximum number of NvLink links supported
+ */
+#define NVML_NVLINK_MAX_LINKS 6
+
+/**
+ * Enum to represent the NvLink utilization counter packet units
+ */
+typedef enum nvmlNvLinkUtilizationCountUnits_enum
+{
+ NVML_NVLINK_COUNTER_UNIT_CYCLES = 0, // count by cycles
+ NVML_NVLINK_COUNTER_UNIT_PACKETS = 1, // count by packets
+ NVML_NVLINK_COUNTER_UNIT_BYTES = 2, // count by bytes
+
+ // this must be last
+ NVML_NVLINK_COUNTER_UNIT_COUNT
+} nvmlNvLinkUtilizationCountUnits_t;
+
+/**
+ * Enum to represent the NvLink utilization counter packet types to count
+ * ** this is ONLY applicable with the units as packets or bytes
+ * ** as specified in \a nvmlNvLinkUtilizationCountUnits_t
+ * ** all packet filter descriptions are target GPU centric
+ * ** these can be "OR'd" together
+ */
+typedef enum nvmlNvLinkUtilizationCountPktTypes_enum
+{
+ NVML_NVLINK_COUNTER_PKTFILTER_NOP = 0x1, // no operation packets
+ NVML_NVLINK_COUNTER_PKTFILTER_READ = 0x2, // read packets
+ NVML_NVLINK_COUNTER_PKTFILTER_WRITE = 0x4, // write packets
+ NVML_NVLINK_COUNTER_PKTFILTER_RATOM = 0x8, // reduction atomic requests
+ NVML_NVLINK_COUNTER_PKTFILTER_NRATOM = 0x10, // non-reduction atomic requests
+ NVML_NVLINK_COUNTER_PKTFILTER_FLUSH = 0x20, // flush requests
+ NVML_NVLINK_COUNTER_PKTFILTER_RESPDATA = 0x40, // responses with data
+ NVML_NVLINK_COUNTER_PKTFILTER_RESPNODATA = 0x80, // responses without data
+ NVML_NVLINK_COUNTER_PKTFILTER_ALL = 0xFF // all packets
+} nvmlNvLinkUtilizationCountPktTypes_t;
+
+/**
+ * Struct to define the NVLINK counter controls
+ */
+typedef struct nvmlNvLinkUtilizationControl_st
+{
+ nvmlNvLinkUtilizationCountUnits_t units;
+ nvmlNvLinkUtilizationCountPktTypes_t pktfilter;
+} nvmlNvLinkUtilizationControl_t;
+
+/**
+ * Enum to represent NvLink queryable capabilities
+ */
+typedef enum nvmlNvLinkCapability_enum
+{
+ NVML_NVLINK_CAP_P2P_SUPPORTED = 0, // P2P over NVLink is supported
+ NVML_NVLINK_CAP_SYSMEM_ACCESS = 1, // Access to system memory is supported
+ NVML_NVLINK_CAP_P2P_ATOMICS = 2, // P2P atomics are supported
+ NVML_NVLINK_CAP_SYSMEM_ATOMICS= 3, // System memory atomics are supported
+ NVML_NVLINK_CAP_SLI_BRIDGE = 4, // SLI is supported over this link
+ NVML_NVLINK_CAP_VALID = 5, // Link is supported on this device
+ // should be last
+ NVML_NVLINK_CAP_COUNT
+} nvmlNvLinkCapability_t;
+
+/**
+ * Enum to represent NvLink queryable error counters
+ */
+typedef enum nvmlNvLinkErrorCounter_enum
+{
+ NVML_NVLINK_ERROR_DL_REPLAY = 0, // Data link transmit replay error counter
+ NVML_NVLINK_ERROR_DL_RECOVERY = 1, // Data link transmit recovery error counter
+ NVML_NVLINK_ERROR_DL_CRC_FLIT = 2, // Data link receive flow control digit CRC error counter
+ NVML_NVLINK_ERROR_DL_CRC_DATA = 3, // Data link receive data CRC error counter
+
+ // this must be last
+ NVML_NVLINK_ERROR_COUNT
+} nvmlNvLinkErrorCounter_t;
+
+/**
+ * Represents level relationships within a system between two GPUs
+ * The enums are spaced to allow for future relationships
+ */
+typedef enum nvmlGpuLevel_enum
+{
+ NVML_TOPOLOGY_INTERNAL = 0, // e.g. Tesla K80
+ NVML_TOPOLOGY_SINGLE = 10, // all devices that only need traverse a single PCIe switch
+ NVML_TOPOLOGY_MULTIPLE = 20, // all devices that need not traverse a host bridge
+ NVML_TOPOLOGY_HOSTBRIDGE = 30, // all devices that are connected to the same host bridge
+ NVML_TOPOLOGY_NODE = 40, // all devices that are connected to the same NUMA node but possibly multiple host bridges
+ NVML_TOPOLOGY_SYSTEM = 50, // all devices in the system
+
+ // there is purposefully no COUNT here because of the need for spacing above
+} nvmlGpuTopologyLevel_t;
+
+/* Compatibility for CPU->NODE renaming */
+#define NVML_TOPOLOGY_CPU NVML_TOPOLOGY_NODE
+
+/* P2P Capability Index Status*/
+typedef enum nvmlGpuP2PStatus_enum
+{
+ NVML_P2P_STATUS_OK = 0,
+ NVML_P2P_STATUS_CHIPSET_NOT_SUPPORED,
+ NVML_P2P_STATUS_GPU_NOT_SUPPORTED,
+ NVML_P2P_STATUS_IOH_TOPOLOGY_NOT_SUPPORTED,
+ NVML_P2P_STATUS_DISABLED_BY_REGKEY,
+ NVML_P2P_STATUS_NOT_SUPPORTED,
+ NVML_P2P_STATUS_UNKNOWN
+
+} nvmlGpuP2PStatus_t;
+
+/* P2P Capability Index*/
+typedef enum nvmlGpuP2PCapsIndex_enum
+{
+ NVML_P2P_CAPS_INDEX_READ = 0,
+ NVML_P2P_CAPS_INDEX_WRITE,
+ NVML_P2P_CAPS_INDEX_NVLINK,
+ NVML_P2P_CAPS_INDEX_ATOMICS,
+ NVML_P2P_CAPS_INDEX_PROP,
+ NVML_P2P_CAPS_INDEX_UNKNOWN
+}nvmlGpuP2PCapsIndex_t;
+
+/**
+ * Maximum limit on Physical Bridges per Board
+ */
+#define NVML_MAX_PHYSICAL_BRIDGE (128)
+
+/**
+ * Information about the Bridge Chip Firmware
+ */
+typedef struct nvmlBridgeChipInfo_st
+{
+ nvmlBridgeChipType_t type; //!< Type of Bridge Chip
+ unsigned int fwVersion; //!< Firmware Version. 0=Version is unavailable
+}nvmlBridgeChipInfo_t;
+
+/**
+ * This structure stores the complete Hierarchy of the Bridge Chip within the board. The immediate
+ * bridge is stored at index 0 of bridgeInfoList, parent to immediate bridge is at index 1 and so forth.
+ */
+typedef struct nvmlBridgeChipHierarchy_st
+{
+ unsigned char bridgeCount; //!< Number of Bridge Chips on the Board
+ nvmlBridgeChipInfo_t bridgeChipInfo[NVML_MAX_PHYSICAL_BRIDGE]; //!< Hierarchy of Bridge Chips on the board
+}nvmlBridgeChipHierarchy_t;
+
+/**
+ * Represents Type of Sampling Event
+ */
+typedef enum nvmlSamplingType_enum
+{
+ NVML_TOTAL_POWER_SAMPLES = 0, //!< To represent total power drawn by GPU
+ NVML_GPU_UTILIZATION_SAMPLES = 1, //!< To represent percent of time during which one or more kernels was executing on the GPU
+ NVML_MEMORY_UTILIZATION_SAMPLES = 2, //!< To represent percent of time during which global (device) memory was being read or written
+ NVML_ENC_UTILIZATION_SAMPLES = 3, //!< To represent percent of time during which NVENC remains busy
+ NVML_DEC_UTILIZATION_SAMPLES = 4, //!< To represent percent of time during which NVDEC remains busy
+ NVML_PROCESSOR_CLK_SAMPLES = 5, //!< To represent processor clock samples
+ NVML_MEMORY_CLK_SAMPLES = 6, //!< To represent memory clock samples
+
+ // Keep this last
+ NVML_SAMPLINGTYPE_COUNT
+}nvmlSamplingType_t;
+
+/**
+ * Represents the queryable PCIe utilization counters
+ */
+typedef enum nvmlPcieUtilCounter_enum
+{
+ NVML_PCIE_UTIL_TX_BYTES = 0, // 1KB granularity
+ NVML_PCIE_UTIL_RX_BYTES = 1, // 1KB granularity
+
+ // Keep this last
+ NVML_PCIE_UTIL_COUNT
+} nvmlPcieUtilCounter_t;
+
+/**
+ * Represents the type for sample value returned
+ */
+typedef enum nvmlValueType_enum
+{
+ NVML_VALUE_TYPE_DOUBLE = 0,
+ NVML_VALUE_TYPE_UNSIGNED_INT = 1,
+ NVML_VALUE_TYPE_UNSIGNED_LONG = 2,
+ NVML_VALUE_TYPE_UNSIGNED_LONG_LONG = 3,
+ NVML_VALUE_TYPE_SIGNED_LONG_LONG = 4,
+
+ // Keep this last
+ NVML_VALUE_TYPE_COUNT
+}nvmlValueType_t;
+
+
+/**
+ * Union to represent different types of Value
+ */
+typedef union nvmlValue_st
+{
+ double dVal; //!< If the value is double
+ unsigned int uiVal; //!< If the value is unsigned int
+ unsigned long ulVal; //!< If the value is unsigned long
+ unsigned long long ullVal; //!< If the value is unsigned long long
+ signed long long sllVal; //!< If the value is signed long long
+}nvmlValue_t;
+
+/**
+ * Information for Sample
+ */
+typedef struct nvmlSample_st
+{
+ unsigned long long timeStamp; //!< CPU Timestamp in microseconds
+ nvmlValue_t sampleValue; //!< Sample Value
+}nvmlSample_t;
+
+/**
+ * Represents type of perf policy for which violation times can be queried
+ */
+typedef enum nvmlPerfPolicyType_enum
+{
+ NVML_PERF_POLICY_POWER = 0, //!< How long did power violations cause the GPU to be below application clocks
+ NVML_PERF_POLICY_THERMAL = 1, //!< How long did thermal violations cause the GPU to be below application clocks
+ NVML_PERF_POLICY_SYNC_BOOST = 2, //!< How long did sync boost cause the GPU to be below application clocks
+ NVML_PERF_POLICY_BOARD_LIMIT = 3, //!< How long did the board limit cause the GPU to be below application clocks
+ NVML_PERF_POLICY_LOW_UTILIZATION = 4, //!< How long did low utilization cause the GPU to be below application clocks
+ NVML_PERF_POLICY_RELIABILITY = 5, //!< How long did the board reliability limit cause the GPU to be below application clocks
+
+ NVML_PERF_POLICY_TOTAL_APP_CLOCKS = 10, //!< Total time the GPU was held below application clocks by any limiter (0 - 5 above)
+ NVML_PERF_POLICY_TOTAL_BASE_CLOCKS = 11, //!< Total time the GPU was held below base clocks
+
+ // Keep this last
+ NVML_PERF_POLICY_COUNT
+}nvmlPerfPolicyType_t;
+
+/**
+ * Struct to hold perf policy violation status data
+ */
+typedef struct nvmlViolationTime_st
+{
+ unsigned long long referenceTime; //!< referenceTime represents CPU timestamp in microseconds
+ unsigned long long violationTime; //!< violationTime in Nanoseconds
+}nvmlViolationTime_t;
+
+/** @} */
+
+/***************************************************************************************************/
+/** @defgroup nvmlDeviceEnumvs Device Enums
+ * @{
+ */
+/***************************************************************************************************/
+
+/**
+ * Generic enable/disable enum.
+ */
+typedef enum nvmlEnableState_enum
+{
+ NVML_FEATURE_DISABLED = 0, //!< Feature disabled
+ NVML_FEATURE_ENABLED = 1 //!< Feature enabled
+} nvmlEnableState_t;
+
+//! Generic flag used to specify the default behavior of some functions. See description of particular functions for details.
+#define nvmlFlagDefault 0x00
+//! Generic flag used to force some behavior. See description of particular functions for details.
+#define nvmlFlagForce 0x01
+
+/**
+ * The brand of the GPU.
+ */
+typedef enum nvmlBrandType_enum
+{
+ NVML_BRAND_UNKNOWN = 0,
+ NVML_BRAND_QUADRO = 1,
+ NVML_BRAND_TESLA = 2,
+ NVML_BRAND_NVS = 3,
+ NVML_BRAND_GRID = 4,
+ NVML_BRAND_GEFORCE = 5,
+ NVML_BRAND_TITAN = 6,
+
+ // Keep this last
+ NVML_BRAND_COUNT
+} nvmlBrandType_t;
+
+/**
+ * Temperature thresholds.
+ */
+typedef enum nvmlTemperatureThresholds_enum
+{
+ NVML_TEMPERATURE_THRESHOLD_SHUTDOWN = 0, // Temperature at which the GPU will shut down
+ // for HW protection
+ NVML_TEMPERATURE_THRESHOLD_SLOWDOWN = 1, // Temperature at which the GPU will begin HW slowdown
+ NVML_TEMPERATURE_THRESHOLD_MEM_MAX = 2, // Memory Temperature at which the GPU will begin SW slowdown
+ NVML_TEMPERATURE_THRESHOLD_GPU_MAX = 3, // GPU Temperature at which the GPU can be throttled below base clock
+ // Keep this last
+ NVML_TEMPERATURE_THRESHOLD_COUNT
+} nvmlTemperatureThresholds_t;
+
+/**
+ * Temperature sensors.
+ */
+typedef enum nvmlTemperatureSensors_enum
+{
+ NVML_TEMPERATURE_GPU = 0, //!< Temperature sensor for the GPU die
+
+ // Keep this last
+ NVML_TEMPERATURE_COUNT
+} nvmlTemperatureSensors_t;
+
+/**
+ * Compute mode.
+ *
+ * NVML_COMPUTEMODE_EXCLUSIVE_PROCESS was added in CUDA 4.0.
+ * Earlier CUDA versions supported a single exclusive mode,
+ * which is equivalent to NVML_COMPUTEMODE_EXCLUSIVE_THREAD in CUDA 4.0 and beyond.
+ */
+typedef enum nvmlComputeMode_enum
+{
+ NVML_COMPUTEMODE_DEFAULT = 0, //!< Default compute mode -- multiple contexts per device
+ NVML_COMPUTEMODE_EXCLUSIVE_THREAD = 1, //!< Support Removed
+ NVML_COMPUTEMODE_PROHIBITED = 2, //!< Compute-prohibited mode -- no contexts per device
+ NVML_COMPUTEMODE_EXCLUSIVE_PROCESS = 3, //!< Compute-exclusive-process mode -- only one context per device, usable from multiple threads at a time
+
+ // Keep this last
+ NVML_COMPUTEMODE_COUNT
+} nvmlComputeMode_t;
+
+/**
+ * ECC bit types.
+ *
+ * @deprecated See \ref nvmlMemoryErrorType_t for a more flexible type
+ */
+#define nvmlEccBitType_t nvmlMemoryErrorType_t
+
+/**
+ * Single bit ECC errors
+ *
+ * @deprecated Mapped to \ref NVML_MEMORY_ERROR_TYPE_CORRECTED
+ */
+#define NVML_SINGLE_BIT_ECC NVML_MEMORY_ERROR_TYPE_CORRECTED
+
+/**
+ * Double bit ECC errors
+ *
+ * @deprecated Mapped to \ref NVML_MEMORY_ERROR_TYPE_UNCORRECTED
+ */
+#define NVML_DOUBLE_BIT_ECC NVML_MEMORY_ERROR_TYPE_UNCORRECTED
+
+/**
+ * Memory error types
+ */
+typedef enum nvmlMemoryErrorType_enum
+{
+ /**
+ * A memory error that was corrected
+ *
+ * For ECC errors, these are single bit errors
+ * For Texture memory, these are errors fixed by resend
+ */
+ NVML_MEMORY_ERROR_TYPE_CORRECTED = 0,
+ /**
+ * A memory error that was not corrected
+ *
+ * For ECC errors, these are double bit errors
+ * For Texture memory, these are errors where the resend fails
+ */
+ NVML_MEMORY_ERROR_TYPE_UNCORRECTED = 1,
+
+
+ // Keep this last
+ NVML_MEMORY_ERROR_TYPE_COUNT //!< Count of memory error types
+
+} nvmlMemoryErrorType_t;
+
+/**
+ * ECC counter types.
+ *
+ * Note: Volatile counts are reset each time the driver loads. On Windows this is once per boot. On Linux this can be more frequent.
+ * On Linux the driver unloads when no active clients exist. If persistence mode is enabled or there is always a driver
+ * client active (e.g. X11), then Linux also sees per-boot behavior. If not, volatile counts are reset each time a compute app
+ * is run.
+ */
+typedef enum nvmlEccCounterType_enum
+{
+ NVML_VOLATILE_ECC = 0, //!< Volatile counts are reset each time the driver loads.
+ NVML_AGGREGATE_ECC = 1, //!< Aggregate counts persist across reboots (i.e. for the lifetime of the device)
+
+ // Keep this last
+ NVML_ECC_COUNTER_TYPE_COUNT //!< Count of memory counter types
+} nvmlEccCounterType_t;
+
+/**
+ * Clock types.
+ *
+ * All speeds are in MHz.
+ */
+typedef enum nvmlClockType_enum
+{
+ NVML_CLOCK_GRAPHICS = 0, //!< Graphics clock domain
+ NVML_CLOCK_SM = 1, //!< SM clock domain
+ NVML_CLOCK_MEM = 2, //!< Memory clock domain
+ NVML_CLOCK_VIDEO = 3, //!< Video encoder/decoder clock domain
+
+ // Keep this last
+ NVML_CLOCK_COUNT //usedGpuMemory is not supported
+
+
+ unsigned long long time; //!< Amount of time in ms during which the compute context was active. The time is reported as 0 if
+ //!< the process is not terminated
+
+ unsigned long long startTime; //!< CPU Timestamp in usec representing start time for the process
+
+ unsigned int isRunning; //!< Flag to represent if the process is running (1 for running, 0 for terminated)
+
+ unsigned int reserved[5]; //!< Reserved for future use
+} nvmlAccountingStats_t;
+
+/** @} */
+
+/***************************************************************************************************/
+/** @defgroup nvmlVgpuConstants Vgpu Constants
+ * @{
+ */
+/***************************************************************************************************/
+
+/**
+ * Buffer size guaranteed to be large enough for \ref nvmlVgpuTypeGetLicense
+ */
+#define NVML_GRID_LICENSE_BUFFER_SIZE 128
+
+#define NVML_VGPU_NAME_BUFFER_SIZE 64
+
+#define NVML_GRID_LICENSE_FEATURE_MAX_COUNT 3
+
+/*!
+ * Macros for pGPU's virtualization capabilities bitfield.
+ */
+#define NVML_VGPU_PGPU_VIRTUALIZATION_CAP_MIGRATION 0:0
+#define NVML_VGPU_PGPU_VIRTUALIZATION_CAP_MIGRATION_NO 0x0
+#define NVML_VGPU_PGPU_VIRTUALIZATION_CAP_MIGRATION_YES 0x1
+
+/** @} */
+
+/***************************************************************************************************/
+/** @defgroup nvmlVgpuEnum Vgpu Enum
+ * @{
+ */
+/***************************************************************************************************/
+
+/*!
+ * Types of VM identifiers
+ */
+typedef enum nvmlVgpuVmIdType {
+ NVML_VGPU_VM_ID_DOMAIN_ID = 0, //!< VM ID represents DOMAIN ID
+ NVML_VGPU_VM_ID_UUID = 1, //!< VM ID represents UUID
+} nvmlVgpuVmIdType_t;
+
+// vGPU GUEST info state.
+typedef enum nvmlVgpuGuestInfoState_enum
+{
+ NVML_VGPU_INSTANCE_GUEST_INFO_STATE_UNINITIALIZED = 0, //= 0 and < \a unitCount
+ * @param unit Reference in which to return the unit handle
+ *
+ * @return
+ * - \ref NVML_SUCCESS if \a unit has been set
+ * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized
+ * - \ref NVML_ERROR_INVALID_ARGUMENT if \a index is invalid or \a unit is NULL
+ * - \ref NVML_ERROR_UNKNOWN on any unexpected error
+ */
+nvmlReturn_t DECLDIR nvmlUnitGetHandleByIndex(unsigned int index, nvmlUnit_t *unit);
+
+/**
+ * Retrieves the static information associated with a unit.
+ *
+ * For S-class products.
+ *
+ * See \ref nvmlUnitInfo_t for details on available unit info.
+ *
+ * @param unit The identifier of the target unit
+ * @param info Reference in which to return the unit information
+ *
+ * @return
+ * - \ref NVML_SUCCESS if \a info has been populated
+ * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized
+ * - \ref NVML_ERROR_INVALID_ARGUMENT if \a unit is invalid or \a info is NULL
+ */
+nvmlReturn_t DECLDIR nvmlUnitGetUnitInfo(nvmlUnit_t unit, nvmlUnitInfo_t *info);
+
+/**
+ * Retrieves the LED state associated with this unit.
+ *
+ * For S-class products.
+ *
+ * See \ref nvmlLedState_t for details on allowed states.
+ *
+ * @param unit The identifier of the target unit
+ * @param state Reference in which to return the current LED state
+ *
+ * @return
+ * - \ref NVML_SUCCESS if \a state has been set
+ * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized
+ * - \ref NVML_ERROR_INVALID_ARGUMENT if \a unit is invalid or \a state is NULL
+ * - \ref NVML_ERROR_NOT_SUPPORTED if this is not an S-class product
+ * - \ref NVML_ERROR_UNKNOWN on any unexpected error
+ *
+ * @see nvmlUnitSetLedState()
+ */
+nvmlReturn_t DECLDIR nvmlUnitGetLedState(nvmlUnit_t unit, nvmlLedState_t *state);
+
+/**
+ * Retrieves the PSU stats for the unit.
+ *
+ * For S-class products.
+ *
+ * See \ref nvmlPSUInfo_t for details on available PSU info.
+ *
+ * @param unit The identifier of the target unit
+ * @param psu Reference in which to return the PSU information
+ *
+ * @return
+ * - \ref NVML_SUCCESS if \a psu has been populated
+ * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized
+ * - \ref NVML_ERROR_INVALID_ARGUMENT if \a unit is invalid or \a psu is NULL
+ * - \ref NVML_ERROR_NOT_SUPPORTED if this is not an S-class product
+ * - \ref NVML_ERROR_UNKNOWN on any unexpected error
+ */
+nvmlReturn_t DECLDIR nvmlUnitGetPsuInfo(nvmlUnit_t unit, nvmlPSUInfo_t *psu);
+
+/**
+ * Retrieves the temperature readings for the unit, in degrees C.
+ *
+ * For S-class products.
+ *
+ * Depending on the product, readings may be available for intake (type=0),
+ * exhaust (type=1) and board (type=2).
+ *
+ * @param unit The identifier of the target unit
+ * @param type The type of reading to take
+ * @param temp Reference in which to return the temperature reading selected by \a type
+ *
+ * @return
+ * - \ref NVML_SUCCESS if \a temp has been populated
+ * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized
+ * - \ref NVML_ERROR_INVALID_ARGUMENT if \a unit or \a type is invalid or \a temp is NULL
+ * - \ref NVML_ERROR_NOT_SUPPORTED if this is not an S-class product
+ * - \ref NVML_ERROR_UNKNOWN on any unexpected error
+ */
+nvmlReturn_t DECLDIR nvmlUnitGetTemperature(nvmlUnit_t unit, unsigned int type, unsigned int *temp);
+
+/**
+ * Retrieves the fan speed readings for the unit.
+ *
+ * For S-class products.
+ *
+ * See \ref nvmlUnitFanSpeeds_t for details on available fan speed info.
+ *
+ * @param unit The identifier of the target unit
+ * @param fanSpeeds Reference in which to return the fan speed information
+ *
+ * @return
+ * - \ref NVML_SUCCESS if \a fanSpeeds has been populated
+ * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized
+ * - \ref NVML_ERROR_INVALID_ARGUMENT if \a unit is invalid or \a fanSpeeds is NULL
+ * - \ref NVML_ERROR_NOT_SUPPORTED if this is not an S-class product
+ * - \ref NVML_ERROR_UNKNOWN on any unexpected error
+ */
+nvmlReturn_t DECLDIR nvmlUnitGetFanSpeedInfo(nvmlUnit_t unit, nvmlUnitFanSpeeds_t *fanSpeeds);
+
+/**
+ * Retrieves the set of GPU devices that are attached to the specified unit.
+ *
+ * For S-class products.
+ *
+ * The \a deviceCount argument is expected to be set to the size of the input \a devices array.
+ *
+ * @param unit The identifier of the target unit
+ * @param deviceCount Reference in which to provide the \a devices array size, and
+ * to return the number of attached GPU devices
+ * @param devices Reference in which to return the references to the attached GPU devices
+ *
+ * @return
+ * - \ref NVML_SUCCESS if \a deviceCount and \a devices have been populated
+ * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized
+ * - \ref NVML_ERROR_INSUFFICIENT_SIZE if \a deviceCount indicates that the \a devices array is too small
+ * - \ref NVML_ERROR_INVALID_ARGUMENT if \a unit is invalid, either of \a deviceCount or \a devices is NULL
+ * - \ref NVML_ERROR_UNKNOWN on any unexpected error
+ */
+nvmlReturn_t DECLDIR nvmlUnitGetDevices(nvmlUnit_t unit, unsigned int *deviceCount, nvmlDevice_t *devices);
+
+/**
+ * Retrieves the IDs and firmware versions for any Host Interface Cards (HICs) in the system.
+ *
+ * For S-class products.
+ *
+ * The \a hwbcCount argument is expected to be set to the size of the input \a hwbcEntries array.
+ * The HIC must be connected to an S-class system for it to be reported by this function.
+ *
+ * @param hwbcCount Size of hwbcEntries array
+ * @param hwbcEntries Array holding information about hwbc
+ *
+ * @return
+ * - \ref NVML_SUCCESS if \a hwbcCount and \a hwbcEntries have been populated
+ * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized
+ * - \ref NVML_ERROR_INVALID_ARGUMENT if either \a hwbcCount or \a hwbcEntries is NULL
+ * - \ref NVML_ERROR_INSUFFICIENT_SIZE if \a hwbcCount indicates that the \a hwbcEntries array is too small
+ */
+nvmlReturn_t DECLDIR nvmlSystemGetHicVersion(unsigned int *hwbcCount, nvmlHwbcEntry_t *hwbcEntries);
+/** @} */
+
+/***************************************************************************************************/
+/** @defgroup nvmlDeviceQueries Device Queries
+ * This chapter describes the queries that NVML can perform against each device.
+ * In each case the device is identified with an nvmlDevice_t handle. This handle is obtained by
+ * calling one of \ref nvmlDeviceGetHandleByIndex(), \ref nvmlDeviceGetHandleBySerial(),
+ * \ref nvmlDeviceGetHandleByPciBusId(), or \ref nvmlDeviceGetHandleByUUID().
+ * @{
+ */
+/***************************************************************************************************/
+
+ /**
+ * Retrieves the number of compute devices in the system. A compute device is a single GPU.
+ *
+ * For all products.
+ *
+ * Note: New nvmlDeviceGetCount_v2 (default in NVML 5.319) returns count of all devices in the system
+ * even if nvmlDeviceGetHandleByIndex_v2 returns NVML_ERROR_NO_PERMISSION for such device.
+ * Update your code to handle this error, or use NVML 4.304 or older nvml header file.
+ * For backward binary compatibility reasons _v1 version of the API is still present in the shared
+ * library.
+ * Old _v1 version of nvmlDeviceGetCount doesn't count devices that NVML has no permission to talk to.
+ *
+ * @param deviceCount Reference in which to return the number of accessible devices
+ *
+ * @return
+ * - \ref NVML_SUCCESS if \a deviceCount has been set
+ * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized
+ * - \ref NVML_ERROR_INVALID_ARGUMENT if \a deviceCount is NULL
+ * - \ref NVML_ERROR_UNKNOWN on any unexpected error
+ */
+nvmlReturn_t DECLDIR nvmlDeviceGetCount(unsigned int *deviceCount);
+
+/**
+ * Acquire the handle for a particular device, based on its index.
+ *
+ * For all products.
+ *
+ * Valid indices are derived from the \a accessibleDevices count returned by
+ * \ref nvmlDeviceGetCount(). For example, if \a accessibleDevices is 2 the valid indices
+ * are 0 and 1, corresponding to GPU 0 and GPU 1.
+ *
+ * The order in which NVML enumerates devices has no guarantees of consistency between reboots. For that reason it
+ * is recommended that devices be looked up by their PCI ids or UUID. See
+ * \ref nvmlDeviceGetHandleByUUID() and \ref nvmlDeviceGetHandleByPciBusId().
+ *
+ * Note: The NVML index may not correlate with other APIs, such as the CUDA device index.
+ *
+ * Starting from NVML 5, this API causes NVML to initialize the target GPU.
+ * NVML may initialize additional GPUs if:
+ * - The target GPU is an SLI slave
+ *
+ * Note: New nvmlDeviceGetCount_v2 (default in NVML 5.319) returns count of all devices in the system
+ * even if nvmlDeviceGetHandleByIndex_v2 returns NVML_ERROR_NO_PERMISSION for such device.
+ * Update your code to handle this error, or use NVML 4.304 or older nvml header file.
+ * For backward binary compatibility reasons _v1 version of the API is still present in the shared
+ * library.
+ * Old _v1 version of nvmlDeviceGetCount doesn't count devices that NVML has no permission to talk to.
+ *
+ * This means that nvmlDeviceGetHandleByIndex_v2 and _v1 can return different devices for the same index.
+ * If you don't touch macros that map old (_v1) versions to _v2 versions at the top of the file you don't
+ * need to worry about that.
+ *
+ * @param index The index of the target GPU, >= 0 and < \a accessibleDevices
+ * @param device Reference in which to return the device handle
+ *
+ * @return
+ * - \ref NVML_SUCCESS if \a device has been set
+ * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized
+ * - \ref NVML_ERROR_INVALID_ARGUMENT if \a index is invalid or \a device is NULL
+ * - \ref NVML_ERROR_INSUFFICIENT_POWER if any attached devices have improperly attached external power cables
+ * - \ref NVML_ERROR_NO_PERMISSION if the user doesn't have permission to talk to this device
+ * - \ref NVML_ERROR_IRQ_ISSUE if NVIDIA kernel detected an interrupt issue with the attached GPUs
+ * - \ref NVML_ERROR_GPU_IS_LOST if the target GPU has fallen off the bus or is otherwise inaccessible
+ * - \ref NVML_ERROR_UNKNOWN on any unexpected error
+ *
+ * @see nvmlDeviceGetIndex
+ * @see nvmlDeviceGetCount
+ */
+nvmlReturn_t DECLDIR nvmlDeviceGetHandleByIndex(unsigned int index, nvmlDevice_t *device);
+
+/**
+ * Acquire the handle for a particular device, based on its board serial number.
+ *
+ * For Fermi &tm; or newer fully supported devices.
+ *
+ * This number corresponds to the value printed directly on the board, and to the value returned by
+ * \ref nvmlDeviceGetSerial().
+ *
+ * @deprecated Since more than one GPU can exist on a single board this function is deprecated in favor
+ * of \ref nvmlDeviceGetHandleByUUID.
+ * For dual GPU boards this function will return NVML_ERROR_INVALID_ARGUMENT.
+ *
+ * Starting from NVML 5, this API causes NVML to initialize the target GPU.
+ * NVML may initialize additional GPUs as it searches for the target GPU
+ *
+ * @param serial The board serial number of the target GPU
+ * @param device Reference in which to return the device handle
+ *
+ * @return
+ * - \ref NVML_SUCCESS if \a device has been set
+ * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized
+ * - \ref NVML_ERROR_INVALID_ARGUMENT if \a serial is invalid, \a device is NULL or more than one
+ * device has the same serial (dual GPU boards)
+ * - \ref NVML_ERROR_NOT_FOUND if \a serial does not match a valid device on the system
+ * - \ref NVML_ERROR_INSUFFICIENT_POWER if any attached devices have improperly attached external power cables
+ * - \ref NVML_ERROR_IRQ_ISSUE if NVIDIA kernel detected an interrupt issue with the attached GPUs
+ * - \ref NVML_ERROR_GPU_IS_LOST if any GPU has fallen off the bus or is otherwise inaccessible
+ * - \ref NVML_ERROR_UNKNOWN on any unexpected error
+ *
+ * @see nvmlDeviceGetSerial
+ * @see nvmlDeviceGetHandleByUUID
+ */
+nvmlReturn_t DECLDIR nvmlDeviceGetHandleBySerial(const char *serial, nvmlDevice_t *device);
+
+/**
+ * Acquire the handle for a particular device, based on its globally unique immutable UUID associated with each device.
+ *
+ * For all products.
+ *
+ * @param uuid The UUID of the target GPU
+ * @param device Reference in which to return the device handle
+ *
+ * Starting from NVML 5, this API causes NVML to initialize the target GPU.
+ * NVML may initialize additional GPUs as it searches for the target GPU
+ *
+ * @return
+ * - \ref NVML_SUCCESS if \a device has been set
+ * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized
+ * - \ref NVML_ERROR_INVALID_ARGUMENT if \a uuid is invalid or \a device is NULL
+ * - \ref NVML_ERROR_NOT_FOUND if \a uuid does not match a valid device on the system
+ * - \ref NVML_ERROR_INSUFFICIENT_POWER if any attached devices have improperly attached external power cables
+ * - \ref NVML_ERROR_IRQ_ISSUE if NVIDIA kernel detected an interrupt issue with the attached GPUs
+ * - \ref NVML_ERROR_GPU_IS_LOST if any GPU has fallen off the bus or is otherwise inaccessible
+ * - \ref NVML_ERROR_UNKNOWN on any unexpected error
+ *
+ * @see nvmlDeviceGetUUID
+ */
+nvmlReturn_t DECLDIR nvmlDeviceGetHandleByUUID(const char *uuid, nvmlDevice_t *device);
+
+/**
+ * Acquire the handle for a particular device, based on its PCI bus id.
+ *
+ * For all products.
+ *
+ * This value corresponds to the nvmlPciInfo_t::busId returned by \ref nvmlDeviceGetPciInfo().
+ *
+ * Starting from NVML 5, this API causes NVML to initialize the target GPU.
+ * NVML may initialize additional GPUs if:
+ * - The target GPU is an SLI slave
+ *
+ * \note NVML 4.304 and older version of nvmlDeviceGetHandleByPciBusId"_v1" returns NVML_ERROR_NOT_FOUND
+ * instead of NVML_ERROR_NO_PERMISSION.
+ *
+ * @param pciBusId The PCI bus id of the target GPU
+ * @param device Reference in which to return the device handle
+ *
+ * @return
+ * - \ref NVML_SUCCESS if \a device has been set
+ * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized
+ * - \ref NVML_ERROR_INVALID_ARGUMENT if \a pciBusId is invalid or \a device is NULL
+ * - \ref NVML_ERROR_NOT_FOUND if \a pciBusId does not match a valid device on the system
+ * - \ref NVML_ERROR_INSUFFICIENT_POWER if the attached device has improperly attached external power cables
+ * - \ref NVML_ERROR_NO_PERMISSION if the user doesn't have permission to talk to this device
+ * - \ref NVML_ERROR_IRQ_ISSUE if NVIDIA kernel detected an interrupt issue with the attached GPUs
+ * - \ref NVML_ERROR_GPU_IS_LOST if the target GPU has fallen off the bus or is otherwise inaccessible
+ * - \ref NVML_ERROR_UNKNOWN on any unexpected error
+ */
+nvmlReturn_t DECLDIR nvmlDeviceGetHandleByPciBusId(const char *pciBusId, nvmlDevice_t *device);
+
+/**
+ * Retrieves the name of this device.
+ *
+ * For all products.
+ *
+ * The name is an alphanumeric string that denotes a particular product, e.g. Tesla &tm; C2070. It will not
+ * exceed 64 characters in length (including the NULL terminator). See \ref
+ * nvmlConstants::NVML_DEVICE_NAME_BUFFER_SIZE.
+ *
+ * @param device The identifier of the target device
+ * @param name Reference in which to return the product name
+ * @param length The maximum allowed length of the string returned in \a name
+ *
+ * @return
+ * - \ref NVML_SUCCESS if \a name has been set
+ * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized
+ * - \ref NVML_ERROR_INVALID_ARGUMENT if \a device is invalid, or \a name is NULL
+ * - \ref NVML_ERROR_INSUFFICIENT_SIZE if \a length is too small
+ * - \ref NVML_ERROR_GPU_IS_LOST if the target GPU has fallen off the bus or is otherwise inaccessible
+ * - \ref NVML_ERROR_UNKNOWN on any unexpected error
+ */
+nvmlReturn_t DECLDIR nvmlDeviceGetName(nvmlDevice_t device, char *name, unsigned int length);
+
+/**
+ * Retrieves the brand of this device.
+ *
+ * For all products.
+ *
+ * The type is a member of \ref nvmlBrandType_t defined above.
+ *
+ * @param device The identifier of the target device
+ * @param type Reference in which to return the product brand type
+ *
+ * @return
+ * - \ref NVML_SUCCESS if \a type has been set
+ * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized
+ * - \ref NVML_ERROR_INVALID_ARGUMENT if \a device is invalid, or \a type is NULL
+ * - \ref NVML_ERROR_GPU_IS_LOST if the target GPU has fallen off the bus or is otherwise inaccessible
+ * - \ref NVML_ERROR_UNKNOWN on any unexpected error
+ */
+nvmlReturn_t DECLDIR nvmlDeviceGetBrand(nvmlDevice_t device, nvmlBrandType_t *type);
+
+/**
+ * Retrieves the NVML index of this device.
+ *
+ * For all products.
+ *
+ * Valid indices are derived from the \a accessibleDevices count returned by
+ * \ref nvmlDeviceGetCount(). For example, if \a accessibleDevices is 2 the valid indices
+ * are 0 and 1, corresponding to GPU 0 and GPU 1.
+ *
+ * The order in which NVML enumerates devices has no guarantees of consistency between reboots. For that reason it
+ * is recommended that devices be looked up by their PCI ids or GPU UUID. See
+ * \ref nvmlDeviceGetHandleByPciBusId() and \ref nvmlDeviceGetHandleByUUID().
+ *
+ * Note: The NVML index may not correlate with other APIs, such as the CUDA device index.
+ *
+ * @param device The identifier of the target device
+ * @param index Reference in which to return the NVML index of the device
+ *
+ * @return
+ * - \ref NVML_SUCCESS if \a index has been set
+ * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized
+ * - \ref NVML_ERROR_INVALID_ARGUMENT if \a device is invalid, or \a index is NULL
+ * - \ref NVML_ERROR_GPU_IS_LOST if the target GPU has fallen off the bus or is otherwise inaccessible
+ * - \ref NVML_ERROR_UNKNOWN on any unexpected error
+ *
+ * @see nvmlDeviceGetHandleByIndex()
+ * @see nvmlDeviceGetCount()
+ */
+nvmlReturn_t DECLDIR nvmlDeviceGetIndex(nvmlDevice_t device, unsigned int *index);
+
+/**
+ * Retrieves the globally unique board serial number associated with this device's board.
+ *
+ * For all products with an inforom.
+ *
+ * The serial number is an alphanumeric string that will not exceed 30 characters (including the NULL terminator).
+ * This number matches the serial number tag that is physically attached to the board. See \ref
+ * nvmlConstants::NVML_DEVICE_SERIAL_BUFFER_SIZE.
+ *
+ * @param device The identifier of the target device
+ * @param serial Reference in which to return the board/module serial number
+ * @param length The maximum allowed length of the string returned in \a serial
+ *
+ * @return
+ * - \ref NVML_SUCCESS if \a serial has been set
+ * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized
+ * - \ref NVML_ERROR_INVALID_ARGUMENT if \a device is invalid, or \a serial is NULL
+ * - \ref NVML_ERROR_INSUFFICIENT_SIZE if \a length is too small
+ * - \ref NVML_ERROR_NOT_SUPPORTED if the device does not support this feature
+ * - \ref NVML_ERROR_GPU_IS_LOST if the target GPU has fallen off the bus or is otherwise inaccessible
+ * - \ref NVML_ERROR_UNKNOWN on any unexpected error
+ */
+nvmlReturn_t DECLDIR nvmlDeviceGetSerial(nvmlDevice_t device, char *serial, unsigned int length);
+
+/**
+ * Retrieves an array of unsigned longs (sized to cpuSetSize) of bitmasks with the ideal CPU affinity for the device.
+ * For example, if processors 0, 1, 32, and 33 are ideal for the device and cpuSetSize == 2,
+ * result[0] = 0x3, result[1] = 0x3 (assuming 32 CPUs per array element, i.e. a 32-bit unsigned long).
+ *
+ * For Kepler &tm; or newer fully supported devices.
+ * Supported on Linux only.
+ *
+ * @param device The identifier of the target device
+ * @param cpuSetSize The size of the cpuSet array that is safe to access
+ * @param cpuSet Array reference in which to return a bitmask of CPUs, 64 CPUs per
+ * unsigned long on 64-bit machines, 32 on 32-bit machines
+ *
+ * @return
+ * - \ref NVML_SUCCESS if \a cpuSet has been filled
+ * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized
+ * - \ref NVML_ERROR_INVALID_ARGUMENT if \a device is invalid, cpuSetSize == 0, or cpuSet is NULL
+ * - \ref NVML_ERROR_NOT_SUPPORTED if the device does not support this feature
+ * - \ref NVML_ERROR_GPU_IS_LOST if the target GPU has fallen off the bus or is otherwise inaccessible
+ * - \ref NVML_ERROR_UNKNOWN on any unexpected error
+ */
+nvmlReturn_t DECLDIR nvmlDeviceGetCpuAffinity(nvmlDevice_t device, unsigned int cpuSetSize, unsigned long *cpuSet);
+
+/**
+ * Sets the ideal affinity for the calling thread and device using the guidelines
+ * given in nvmlDeviceGetCpuAffinity(). Note, this is a change as of version 8.0.
+ * Older versions set the affinity for a calling process and all children.
+ * Currently supports up to 64 processors.
+ *
+ * For Kepler &tm; or newer fully supported devices.
+ * Supported on Linux only.
+ *
+ * @param device The identifier of the target device
+ *
+ * @return
+ * - \ref NVML_SUCCESS if the calling thread has been successfully bound
+ * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized
+ * - \ref NVML_ERROR_INVALID_ARGUMENT if \a device is invalid
+ * - \ref NVML_ERROR_NOT_SUPPORTED if the device does not support this feature
+ * - \ref NVML_ERROR_GPU_IS_LOST if the target GPU has fallen off the bus or is otherwise inaccessible
+ * - \ref NVML_ERROR_UNKNOWN on any unexpected error
+ */
+nvmlReturn_t DECLDIR nvmlDeviceSetCpuAffinity(nvmlDevice_t device);
+
+/**
+ * Clear all affinity bindings for the calling thread. Note, this is a change as of version
+ * 8.0 as older versions cleared the affinity for a calling process and all children.
+ *
+ * For Kepler &tm; or newer fully supported devices.
+ * Supported on Linux only.
+ *
+ * @param device The identifier of the target device
+ *
+ * @return
+ * - \ref NVML_SUCCESS if the calling thread has been successfully unbound
+ * - \ref NVML_ERROR_INVALID_ARGUMENT if \a device is invalid
+ * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized
+ * - \ref NVML_ERROR_UNKNOWN on any unexpected error
+ */
+nvmlReturn_t DECLDIR nvmlDeviceClearCpuAffinity(nvmlDevice_t device);
+
+/**
+ * Retrieve the common ancestor for two devices.
+ * For all products.
+ * Supported on Linux only.
+ *
+ * @param device1 The identifier of the first device
+ * @param device2 The identifier of the second device
+ * @param pathInfo A \ref nvmlGpuTopologyLevel_t that gives the path type
+ *
+ * @return
+ * - \ref NVML_SUCCESS if \a pathInfo has been set
+ * - \ref NVML_ERROR_INVALID_ARGUMENT if \a device1, or \a device2 is invalid, or \a pathInfo is NULL
+ * - \ref NVML_ERROR_NOT_SUPPORTED if the device or OS does not support this feature
+ * - \ref NVML_ERROR_UNKNOWN an error has occurred in underlying topology discovery
+ */
+nvmlReturn_t DECLDIR nvmlDeviceGetTopologyCommonAncestor(nvmlDevice_t device1, nvmlDevice_t device2, nvmlGpuTopologyLevel_t *pathInfo);
+
+/**
+ * Retrieve the set of GPUs that are nearest to a given device at a specific interconnectivity level.
+ * For all products.
+ * Supported on Linux only.
+ *
+ * @param device The identifier of the target device
+ * @param level The \ref nvmlGpuTopologyLevel_t level to search for other GPUs
+ * @param count When zero, is set to the number of matching GPUs such that \a deviceArray
+ * can be malloc'd. When non-zero, \a deviceArray will be filled with \a count
+ * number of device handles.
+ * @param deviceArray An array of device handles for GPUs found at \a level
+ *
+ * @return
+ * - \ref NVML_SUCCESS if \a deviceArray or \a count (if initially zero) has been set
+ * - \ref NVML_ERROR_INVALID_ARGUMENT if \a device, \a level, or \a count is invalid, or \a deviceArray is NULL with a non-zero \a count
+ * - \ref NVML_ERROR_NOT_SUPPORTED if the device or OS does not support this feature
+ * - \ref NVML_ERROR_UNKNOWN an error has occurred in underlying topology discovery
+ */
+nvmlReturn_t DECLDIR nvmlDeviceGetTopologyNearestGpus(nvmlDevice_t device, nvmlGpuTopologyLevel_t level, unsigned int *count, nvmlDevice_t *deviceArray);
+
+/**
+ * Retrieve the set of GPUs that have a CPU affinity with the given CPU number.
+ * For all products.
+ * Supported on Linux only.
+ *
+ * @param cpuNumber The CPU number
+ * @param count When zero, is set to the number of matching GPUs such that \a deviceArray
+ * can be malloc'd. When non-zero, \a deviceArray will be filled with \a count
+ * number of device handles.
+ * @param deviceArray An array of device handles for GPUs found with affinity to \a cpuNumber
+ *
+ * @return
+ * - \ref NVML_SUCCESS if \a deviceArray or \a count (if initially zero) has been set
+ * - \ref NVML_ERROR_INVALID_ARGUMENT if \a cpuNumber, or \a count is invalid, or \a deviceArray is NULL with a non-zero \a count
+ * - \ref NVML_ERROR_NOT_SUPPORTED if the device or OS does not support this feature
+ * - \ref NVML_ERROR_UNKNOWN an error has occurred in underlying topology discovery
+ */
+nvmlReturn_t DECLDIR nvmlSystemGetTopologyGpuSet(unsigned int cpuNumber, unsigned int *count, nvmlDevice_t *deviceArray);
+
+/**
+ * Retrieve the status for a given p2p capability index between a given pair of GPUs.
+ *
+ * @param device1 The first device
+ * @param device2 The second device
+ * @param p2pIndex p2p Capability Index being looked for between \a device1 and \a device2
+ * @param p2pStatus Reference in which to return the status of the \a p2pIndex
+ * between \a device1 and \a device2
+ * @return
+ * - \ref NVML_SUCCESS if \a p2pStatus has been populated
+ * - \ref NVML_ERROR_INVALID_ARGUMENT if \a device1 or \a device2 or \a p2pIndex is invalid or \a p2pStatus is NULL
+ * - \ref NVML_ERROR_UNKNOWN on any unexpected error
+ */
+nvmlReturn_t DECLDIR nvmlDeviceGetP2PStatus(nvmlDevice_t device1, nvmlDevice_t device2, nvmlGpuP2PCapsIndex_t p2pIndex,nvmlGpuP2PStatus_t *p2pStatus);
+
+/**
+ * Retrieves the globally unique immutable UUID associated with this device, as a 5 part hexadecimal string,
+ * that augments the immutable, board serial identifier.
+ *
+ * For all products.
+ *
+ * The UUID is a globally unique identifier. It is the only available identifier for pre-Fermi-architecture products.
+ * It does NOT correspond to any identifier printed on the board. It will not exceed 80 characters in length
+ * (including the NULL terminator). See \ref nvmlConstants::NVML_DEVICE_UUID_BUFFER_SIZE.
+ *
+ * @param device The identifier of the target device
+ * @param uuid Reference in which to return the GPU UUID
+ * @param length The maximum allowed length of the string returned in \a uuid
+ *
+ * @return
+ * - \ref NVML_SUCCESS if \a uuid has been set
+ * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized
+ * - \ref NVML_ERROR_INVALID_ARGUMENT if \a device is invalid, or \a uuid is NULL
+ * - \ref NVML_ERROR_INSUFFICIENT_SIZE if \a length is too small
+ * - \ref NVML_ERROR_NOT_SUPPORTED if the device does not support this feature
+ * - \ref NVML_ERROR_GPU_IS_LOST if the target GPU has fallen off the bus or is otherwise inaccessible
+ * - \ref NVML_ERROR_UNKNOWN on any unexpected error
+ */
+nvmlReturn_t DECLDIR nvmlDeviceGetUUID(nvmlDevice_t device, char *uuid, unsigned int length);
+
+/**
+ * Retrieves minor number for the device. The minor number for the device is such that the Nvidia device node file for
+ * each GPU will have the form /dev/nvidia[minor number].
+ *
+ * For all products.
+ * Supported only for Linux
+ *
+ * @param device The identifier of the target device
+ * @param minorNumber Reference in which to return the minor number for the device
+ * @return
+ * - \ref NVML_SUCCESS if the minor number is successfully retrieved
+ * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized
+ * - \ref NVML_ERROR_INVALID_ARGUMENT if \a device is invalid or \a minorNumber is NULL
+ * - \ref NVML_ERROR_NOT_SUPPORTED if this query is not supported by the device
+ * - \ref NVML_ERROR_GPU_IS_LOST if the target GPU has fallen off the bus or is otherwise inaccessible
+ * - \ref NVML_ERROR_UNKNOWN on any unexpected error
+ */
+nvmlReturn_t DECLDIR nvmlDeviceGetMinorNumber(nvmlDevice_t device, unsigned int *minorNumber);
+
+/**
+ * Retrieves the device board part number which is programmed into the board's InfoROM.
+ *
+ * For all products.
+ *
+ * @param device Identifier of the target device
+ * @param partNumber Reference to the buffer to return
+ * @param length Length of the buffer reference
+ *
+ * @return
+ * - \ref NVML_SUCCESS if \a partNumber has been set
+ * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized
+ * - \ref NVML_ERROR_NOT_SUPPORTED if the needed VBIOS fields have not been filled
+ * - \ref NVML_ERROR_INVALID_ARGUMENT if \a device is invalid or \a partNumber is NULL
+ * - \ref NVML_ERROR_GPU_IS_LOST if the target GPU has fallen off the bus or is otherwise inaccessible
+ * - \ref NVML_ERROR_UNKNOWN on any unexpected error
+ */
+nvmlReturn_t DECLDIR nvmlDeviceGetBoardPartNumber(nvmlDevice_t device, char* partNumber, unsigned int length);
+
+/**
+ * Retrieves the version information for the device's infoROM object.
+ *
+ * For all products with an inforom.
+ *
+ * Fermi and higher parts have non-volatile on-board memory for persisting device info, such as aggregate
+ * ECC counts. The version of the data structures in this memory may change from time to time. It will not
+ * exceed 16 characters in length (including the NULL terminator).
+ * See \ref nvmlConstants::NVML_DEVICE_INFOROM_VERSION_BUFFER_SIZE.
+ *
+ * See \ref nvmlInforomObject_t for details on the available infoROM objects.
+ *
+ * @param device The identifier of the target device
+ * @param object The target infoROM object
+ * @param version Reference in which to return the infoROM version
+ * @param length The maximum allowed length of the string returned in \a version
+ *
+ * @return
+ * - \ref NVML_SUCCESS if \a version has been set
+ * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized
+ * - \ref NVML_ERROR_INVALID_ARGUMENT if \a version is NULL
+ * - \ref NVML_ERROR_INSUFFICIENT_SIZE if \a length is too small
+ * - \ref NVML_ERROR_NOT_SUPPORTED if the device does not have an infoROM
+ * - \ref NVML_ERROR_GPU_IS_LOST if the target GPU has fallen off the bus or is otherwise inaccessible
+ * - \ref NVML_ERROR_UNKNOWN on any unexpected error
+ *
+ * @see nvmlDeviceGetInforomImageVersion
+ */
+nvmlReturn_t DECLDIR nvmlDeviceGetInforomVersion(nvmlDevice_t device, nvmlInforomObject_t object, char *version, unsigned int length);
+
+/**
+ * Retrieves the global infoROM image version
+ *
+ * For all products with an inforom.
+ *
+ * Image version just like VBIOS version uniquely describes the exact version of the infoROM flashed on the board
+ * in contrast to infoROM object version which is only an indicator of supported features.
+ * Version string will not exceed 16 characters in length (including the NULL terminator).
+ * See \ref nvmlConstants::NVML_DEVICE_INFOROM_VERSION_BUFFER_SIZE.
+ *
+ * @param device The identifier of the target device
+ * @param version Reference in which to return the infoROM image version
+ * @param length The maximum allowed length of the string returned in \a version
+ *
+ * @return
+ * - \ref NVML_SUCCESS if \a version has been set
+ * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized
+ * - \ref NVML_ERROR_INVALID_ARGUMENT if \a version is NULL
+ * - \ref NVML_ERROR_INSUFFICIENT_SIZE if \a length is too small
+ * - \ref NVML_ERROR_NOT_SUPPORTED if the device does not have an infoROM
+ * - \ref NVML_ERROR_GPU_IS_LOST if the target GPU has fallen off the bus or is otherwise inaccessible
+ * - \ref NVML_ERROR_UNKNOWN on any unexpected error
+ *
+ * @see nvmlDeviceGetInforomVersion
+ */
+nvmlReturn_t DECLDIR nvmlDeviceGetInforomImageVersion(nvmlDevice_t device, char *version, unsigned int length);
+
+/**
+ * Retrieves the checksum of the configuration stored in the device's infoROM.
+ *
+ * For all products with an inforom.
+ *
+ * Can be used to make sure that two GPUs have the exact same configuration.
+ * Current checksum takes into account configuration stored in PWR and ECC infoROM objects.
+ * Checksum can change between driver releases or when user changes configuration (e.g. disable/enable ECC)
+ *
+ * @param device The identifier of the target device
+ * @param checksum Reference in which to return the infoROM configuration checksum
+ *
+ * @return
+ * - \ref NVML_SUCCESS if \a checksum has been set
+ * - \ref NVML_ERROR_CORRUPTED_INFOROM if the device's checksum couldn't be retrieved due to infoROM corruption
+ * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized
+ * - \ref NVML_ERROR_INVALID_ARGUMENT if \a checksum is NULL
+ * - \ref NVML_ERROR_NOT_SUPPORTED if the device does not support this feature
+ * - \ref NVML_ERROR_GPU_IS_LOST if the target GPU has fallen off the bus or is otherwise inaccessible
+ * - \ref NVML_ERROR_UNKNOWN on any unexpected error
+ */
+nvmlReturn_t DECLDIR nvmlDeviceGetInforomConfigurationChecksum(nvmlDevice_t device, unsigned int *checksum);
+
+/**
+ * Reads the infoROM from the flash and verifies the checksums.
+ *
+ * For all products with an inforom.
+ *
+ * @param device The identifier of the target device
+ *
+ * @return
+ * - \ref NVML_SUCCESS if infoROM is not corrupted
+ * - \ref NVML_ERROR_CORRUPTED_INFOROM if the device's infoROM is corrupted
+ * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized
+ * - \ref NVML_ERROR_NOT_SUPPORTED if the device does not support this feature
+ * - \ref NVML_ERROR_GPU_IS_LOST if the target GPU has fallen off the bus or is otherwise inaccessible
+ * - \ref NVML_ERROR_UNKNOWN on any unexpected error
+ */
+nvmlReturn_t DECLDIR nvmlDeviceValidateInforom(nvmlDevice_t device);
+
+/**
+ * Retrieves the display mode for the device.
+ *
+ * For all products.
+ *
+ * This method indicates whether a physical display (e.g. monitor) is currently connected to
+ * any of the device's connectors.
+ *
+ * See \ref nvmlEnableState_t for details on allowed modes.
+ *
+ * @param device The identifier of the target device
+ * @param display Reference in which to return the display mode
+ *
+ * @return
+ * - \ref NVML_SUCCESS if \a display has been set
+ * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized
+ * - \ref NVML_ERROR_INVALID_ARGUMENT if \a device is invalid or \a display is NULL
+ * - \ref NVML_ERROR_NOT_SUPPORTED if the device does not support this feature
+ * - \ref NVML_ERROR_GPU_IS_LOST if the target GPU has fallen off the bus or is otherwise inaccessible
+ * - \ref NVML_ERROR_UNKNOWN on any unexpected error
+ */
+nvmlReturn_t DECLDIR nvmlDeviceGetDisplayMode(nvmlDevice_t device, nvmlEnableState_t *display);
+
+/**
+ * Retrieves the display active state for the device.
+ *
+ * For all products.
+ *
+ * This method indicates whether a display is initialized on the device.
+ * For example whether X Server is attached to this device and has allocated memory for the screen.
+ *
+ * Display can be active even when no monitor is physically attached.
+ *
+ * See \ref nvmlEnableState_t for details on allowed modes.
+ *
+ * @param device The identifier of the target device
+ * @param isActive Reference in which to return the display active state
+ *
+ * @return
+ * - \ref NVML_SUCCESS if \a isActive has been set
+ * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized
+ * - \ref NVML_ERROR_INVALID_ARGUMENT if \a device is invalid or \a isActive is NULL
+ * - \ref NVML_ERROR_NOT_SUPPORTED if the device does not support this feature
+ * - \ref NVML_ERROR_GPU_IS_LOST if the target GPU has fallen off the bus or is otherwise inaccessible
+ * - \ref NVML_ERROR_UNKNOWN on any unexpected error
+ */
+nvmlReturn_t DECLDIR nvmlDeviceGetDisplayActive(nvmlDevice_t device, nvmlEnableState_t *isActive);
+
+/**
+ * Retrieves the persistence mode associated with this device.
+ *
+ * For all products.
+ * For Linux only.
+ *
+ * When driver persistence mode is enabled the driver software state is not torn down when the last
+ * client disconnects. By default this feature is disabled.
+ *
+ * See \ref nvmlEnableState_t for details on allowed modes.
+ *
+ * @param device The identifier of the target device
+ * @param mode Reference in which to return the current driver persistence mode
+ *
+ * @return
+ * - \ref NVML_SUCCESS if \a mode has been set
+ * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized
+ * - \ref NVML_ERROR_INVALID_ARGUMENT if \a device is invalid or \a mode is NULL
+ * - \ref NVML_ERROR_NOT_SUPPORTED if the device does not support this feature
+ * - \ref NVML_ERROR_GPU_IS_LOST if the target GPU has fallen off the bus or is otherwise inaccessible
+ * - \ref NVML_ERROR_UNKNOWN on any unexpected error
+ *
+ * @see nvmlDeviceSetPersistenceMode()
+ */
+nvmlReturn_t DECLDIR nvmlDeviceGetPersistenceMode(nvmlDevice_t device, nvmlEnableState_t *mode);
+
+/**
+ * Retrieves the PCI attributes of this device.
+ *
+ * For all products.
+ *
+ * See \ref nvmlPciInfo_t for details on the available PCI info.
+ *
+ * @param device The identifier of the target device
+ * @param pci Reference in which to return the PCI info
+ *
+ * @return
+ * - \ref NVML_SUCCESS if \a pci has been populated
+ * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized
+ * - \ref NVML_ERROR_INVALID_ARGUMENT if \a device is invalid or \a pci is NULL
+ * - \ref NVML_ERROR_GPU_IS_LOST if the target GPU has fallen off the bus or is otherwise inaccessible
+ * - \ref NVML_ERROR_UNKNOWN on any unexpected error
+ */
+nvmlReturn_t DECLDIR nvmlDeviceGetPciInfo(nvmlDevice_t device, nvmlPciInfo_t *pci);
+
+/**
+ * Retrieves the maximum PCIe link generation possible with this device and system
+ *
+ * I.E. for a generation 2 PCIe device attached to a generation 1 PCIe bus the max link generation this function will
+ * report is generation 1.
+ *
+ * For Fermi &tm; or newer fully supported devices.
+ *
+ * @param device The identifier of the target device
+ * @param maxLinkGen Reference in which to return the max PCIe link generation
+ *
+ * @return
+ * - \ref NVML_SUCCESS if \a maxLinkGen has been populated
+ * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized
+ * - \ref NVML_ERROR_INVALID_ARGUMENT if \a device is invalid or \a maxLinkGen is null
+ * - \ref NVML_ERROR_NOT_SUPPORTED if PCIe link information is not available
+ * - \ref NVML_ERROR_GPU_IS_LOST if the target GPU has fallen off the bus or is otherwise inaccessible
+ * - \ref NVML_ERROR_UNKNOWN on any unexpected error
+ */
+nvmlReturn_t DECLDIR nvmlDeviceGetMaxPcieLinkGeneration(nvmlDevice_t device, unsigned int *maxLinkGen);
+
+/**
+ * Retrieves the maximum PCIe link width possible with this device and system
+ *
+ * I.E. for a device with a 16x PCIe bus width attached to a 8x PCIe system bus this function will report
+ * a max link width of 8.
+ *
+ * For Fermi &tm; or newer fully supported devices.
+ *
+ * @param device The identifier of the target device
+ * @param maxLinkWidth Reference in which to return the max PCIe link width
+ *
+ * @return
+ * - \ref NVML_SUCCESS if \a maxLinkWidth has been populated
+ * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized
+ * - \ref NVML_ERROR_INVALID_ARGUMENT if \a device is invalid or \a maxLinkWidth is null
+ * - \ref NVML_ERROR_NOT_SUPPORTED if PCIe link information is not available
+ * - \ref NVML_ERROR_GPU_IS_LOST if the target GPU has fallen off the bus or is otherwise inaccessible
+ * - \ref NVML_ERROR_UNKNOWN on any unexpected error
+ */
+nvmlReturn_t DECLDIR nvmlDeviceGetMaxPcieLinkWidth(nvmlDevice_t device, unsigned int *maxLinkWidth);
+
+/**
+ * Retrieves the current PCIe link generation
+ *
+ * For Fermi &tm; or newer fully supported devices.
+ *
+ * @param device The identifier of the target device
+ * @param currLinkGen Reference in which to return the current PCIe link generation
+ *
+ * @return
+ * - \ref NVML_SUCCESS if \a currLinkGen has been populated
+ * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized
+ * - \ref NVML_ERROR_INVALID_ARGUMENT if \a device is invalid or \a currLinkGen is null
+ * - \ref NVML_ERROR_NOT_SUPPORTED if PCIe link information is not available
+ * - \ref NVML_ERROR_GPU_IS_LOST if the target GPU has fallen off the bus or is otherwise inaccessible
+ * - \ref NVML_ERROR_UNKNOWN on any unexpected error
+ */
+nvmlReturn_t DECLDIR nvmlDeviceGetCurrPcieLinkGeneration(nvmlDevice_t device, unsigned int *currLinkGen);
+
+/**
+ * Retrieves the current PCIe link width
+ *
+ * For Fermi &tm; or newer fully supported devices.
+ *
+ * @param device The identifier of the target device
+ * @param currLinkWidth Reference in which to return the current PCIe link width
+ *
+ * @return
+ * - \ref NVML_SUCCESS if \a currLinkWidth has been populated
+ * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized
+ * - \ref NVML_ERROR_INVALID_ARGUMENT if \a device is invalid or \a currLinkWidth is null
+ * - \ref NVML_ERROR_NOT_SUPPORTED if PCIe link information is not available
+ * - \ref NVML_ERROR_GPU_IS_LOST if the target GPU has fallen off the bus or is otherwise inaccessible
+ * - \ref NVML_ERROR_UNKNOWN on any unexpected error
+ */
+nvmlReturn_t DECLDIR nvmlDeviceGetCurrPcieLinkWidth(nvmlDevice_t device, unsigned int *currLinkWidth);
+
+/**
+ * Retrieve PCIe utilization information.
+ * This function is querying a byte counter over a 20ms interval and thus is the
+ * PCIe throughput over that interval.
+ *
+ * For Maxwell &tm; or newer fully supported devices.
+ *
+ * This method is not supported in virtual machines running virtual GPU (vGPU).
+ *
+ * @param device The identifier of the target device
+ * @param counter The specific counter that should be queried \ref nvmlPcieUtilCounter_t
+ * @param value Reference in which to return throughput in KB/s
+ *
+ * @return
+ * - \ref NVML_SUCCESS if \a value has been set
+ * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized
+ * - \ref NVML_ERROR_INVALID_ARGUMENT if \a device or \a counter is invalid, or \a value is NULL
+ * - \ref NVML_ERROR_NOT_SUPPORTED if the device does not support this feature
+ * - \ref NVML_ERROR_GPU_IS_LOST if the target GPU has fallen off the bus or is otherwise inaccessible
+ * - \ref NVML_ERROR_UNKNOWN on any unexpected error
+ */
+nvmlReturn_t DECLDIR nvmlDeviceGetPcieThroughput(nvmlDevice_t device, nvmlPcieUtilCounter_t counter, unsigned int *value);
+
+/**
+ * Retrieve the PCIe replay counter.
+ *
+ * For Kepler &tm; or newer fully supported devices.
+ *
+ * @param device The identifier of the target device
+ * @param value Reference in which to return the counter's value
+ *
+ * @return
+ * - \ref NVML_SUCCESS if \a value has been set
+ * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized
+ * - \ref NVML_ERROR_INVALID_ARGUMENT if \a device is invalid, or \a value is NULL
+ * - \ref NVML_ERROR_NOT_SUPPORTED if the device does not support this feature
+ * - \ref NVML_ERROR_GPU_IS_LOST if the target GPU has fallen off the bus or is otherwise inaccessible
+ * - \ref NVML_ERROR_UNKNOWN on any unexpected error
+ */
+nvmlReturn_t DECLDIR nvmlDeviceGetPcieReplayCounter(nvmlDevice_t device, unsigned int *value);
+
+/**
+ * Retrieves the current clock speeds for the device.
+ *
+ * For Fermi &tm; or newer fully supported devices.
+ *
+ * See \ref nvmlClockType_t for details on available clock information.
+ *
+ * @param device The identifier of the target device
+ * @param type Identify which clock domain to query
+ * @param clock Reference in which to return the clock speed in MHz
+ *
+ * @return
+ * - \ref NVML_SUCCESS if \a clock has been set
+ * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized
+ * - \ref NVML_ERROR_INVALID_ARGUMENT if \a device is invalid or \a clock is NULL
+ * - \ref NVML_ERROR_NOT_SUPPORTED if the device cannot report the specified clock
+ * - \ref NVML_ERROR_GPU_IS_LOST if the target GPU has fallen off the bus or is otherwise inaccessible
+ * - \ref NVML_ERROR_UNKNOWN on any unexpected error
+ */
+nvmlReturn_t DECLDIR nvmlDeviceGetClockInfo(nvmlDevice_t device, nvmlClockType_t type, unsigned int *clock);
+
+/**
+ * Retrieves the maximum clock speeds for the device.
+ *
+ * For Fermi &tm; or newer fully supported devices.
+ *
+ * See \ref nvmlClockType_t for details on available clock information.
+ *
+ * \note On GPUs from the Fermi family, current P0 clocks (reported by \ref nvmlDeviceGetClockInfo) can differ from max clocks
+ * by a few MHz.
+ *
+ * @param device The identifier of the target device
+ * @param type Identify which clock domain to query
+ * @param clock Reference in which to return the clock speed in MHz
+ *
+ * @return
+ * - \ref NVML_SUCCESS if \a clock has been set
+ * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized
+ * - \ref NVML_ERROR_INVALID_ARGUMENT if \a device is invalid or \a clock is NULL
+ * - \ref NVML_ERROR_NOT_SUPPORTED if the device cannot report the specified clock
+ * - \ref NVML_ERROR_GPU_IS_LOST if the target GPU has fallen off the bus or is otherwise inaccessible
+ * - \ref NVML_ERROR_UNKNOWN on any unexpected error
+ */
+nvmlReturn_t DECLDIR nvmlDeviceGetMaxClockInfo(nvmlDevice_t device, nvmlClockType_t type, unsigned int *clock);
+
+/**
+ * Retrieves the current setting of a clock that applications will use unless an overspec situation occurs.
+ * Can be changed using \ref nvmlDeviceSetApplicationsClocks.
+ *
+ * For Kepler &tm; or newer fully supported devices.
+ *
+ * @param device The identifier of the target device
+ * @param clockType Identify which clock domain to query
+ * @param clockMHz Reference in which to return the clock in MHz
+ *
+ * @return
+ * - \ref NVML_SUCCESS if \a clockMHz has been set
+ * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized
+ * - \ref NVML_ERROR_INVALID_ARGUMENT if \a device is invalid or \a clockMHz is NULL or \a clockType is invalid
+ * - \ref NVML_ERROR_NOT_SUPPORTED if the device does not support this feature
+ * - \ref NVML_ERROR_GPU_IS_LOST if the target GPU has fallen off the bus or is otherwise inaccessible
+ * - \ref NVML_ERROR_UNKNOWN on any unexpected error
+ */
+nvmlReturn_t DECLDIR nvmlDeviceGetApplicationsClock(nvmlDevice_t device, nvmlClockType_t clockType, unsigned int *clockMHz);
+
+/**
+ * Retrieves the default applications clock that GPU boots with or
+ * defaults to after \ref nvmlDeviceResetApplicationsClocks call.
+ *
+ * For Kepler &tm; or newer fully supported devices.
+ *
+ * @param device The identifier of the target device
+ * @param clockType Identify which clock domain to query
+ * @param clockMHz Reference in which to return the default clock in MHz
+ *
+ * @return
+ * - \ref NVML_SUCCESS if \a clockMHz has been set
+ * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized
+ * - \ref NVML_ERROR_INVALID_ARGUMENT if \a device is invalid or \a clockMHz is NULL or \a clockType is invalid
+ * - \ref NVML_ERROR_NOT_SUPPORTED if the device does not support this feature
+ * - \ref NVML_ERROR_GPU_IS_LOST if the target GPU has fallen off the bus or is otherwise inaccessible
+ * - \ref NVML_ERROR_UNKNOWN on any unexpected error
+ *
+ * \see nvmlDeviceGetApplicationsClock
+ */
+nvmlReturn_t DECLDIR nvmlDeviceGetDefaultApplicationsClock(nvmlDevice_t device, nvmlClockType_t clockType, unsigned int *clockMHz);
+
+/**
+ * Resets the application clock to the default value
+ *
+ * This is the applications clock that will be used after system reboot or driver reload.
+ * Default value is constant, but the current value can be changed using \ref nvmlDeviceSetApplicationsClocks.
+ *
+ * On Pascal and newer hardware, if clocks were previously locked with \ref nvmlDeviceSetApplicationsClocks,
+ * this call will unlock clocks. This returns clocks to their default behavior of automatically boosting above
+ * base clocks as thermal limits allow.
+ *
+ * @see nvmlDeviceGetApplicationsClock
+ * @see nvmlDeviceSetApplicationsClocks
+ *
+ * For Fermi &tm; or newer non-GeForce fully supported devices and Maxwell or newer GeForce devices.
+ *
+ * @param device The identifier of the target device
+ *
+ * @return
+ * - \ref NVML_SUCCESS if new settings were successfully set
+ * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized
+ * - \ref NVML_ERROR_INVALID_ARGUMENT if \a device is invalid
+ * - \ref NVML_ERROR_NOT_SUPPORTED if the device does not support this feature
+ * - \ref NVML_ERROR_GPU_IS_LOST if the target GPU has fallen off the bus or is otherwise inaccessible
+ * - \ref NVML_ERROR_UNKNOWN on any unexpected error
+ */
+nvmlReturn_t DECLDIR nvmlDeviceResetApplicationsClocks(nvmlDevice_t device);
+
+/**
+ * Retrieves the clock speed for the clock specified by the clock type and clock ID.
+ *
+ * For Kepler &tm; or newer fully supported devices.
+ *
+ * @param device The identifier of the target device
+ * @param clockType Identify which clock domain to query
+ * @param clockId Identify which clock in the domain to query
+ * @param clockMHz Reference in which to return the clock in MHz
+ *
+ * @return
+ * - \ref NVML_SUCCESS if \a clockMHz has been set
+ * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized
+ * - \ref NVML_ERROR_INVALID_ARGUMENT if \a device is invalid or \a clockMHz is NULL or \a clockType is invalid
+ * - \ref NVML_ERROR_NOT_SUPPORTED if the device does not support this feature
+ * - \ref NVML_ERROR_GPU_IS_LOST if the target GPU has fallen off the bus or is otherwise inaccessible
+ * - \ref NVML_ERROR_UNKNOWN on any unexpected error
+ */
+nvmlReturn_t DECLDIR nvmlDeviceGetClock(nvmlDevice_t device, nvmlClockType_t clockType, nvmlClockId_t clockId, unsigned int *clockMHz);
+
+/**
+ * Retrieves the customer defined maximum boost clock speed specified by the given clock type.
+ *
+ * For Pascal &tm; or newer fully supported devices.
+ *
+ * @param device The identifier of the target device
+ * @param clockType Identify which clock domain to query
+ * @param clockMHz Reference in which to return the clock in MHz
+ *
+ * @return
+ * - \ref NVML_SUCCESS if \a clockMHz has been set
+ * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized
+ * - \ref NVML_ERROR_INVALID_ARGUMENT if \a device is invalid or \a clockMHz is NULL or \a clockType is invalid
+ * - \ref NVML_ERROR_NOT_SUPPORTED if the device or the \a clockType on this device does not support this feature
+ * - \ref NVML_ERROR_GPU_IS_LOST if the target GPU has fallen off the bus or is otherwise inaccessible
+ * - \ref NVML_ERROR_UNKNOWN on any unexpected error
+ */
+nvmlReturn_t DECLDIR nvmlDeviceGetMaxCustomerBoostClock(nvmlDevice_t device, nvmlClockType_t clockType, unsigned int *clockMHz);
+
+/**
+ * Retrieves the list of possible memory clocks that can be used as an argument for \ref nvmlDeviceSetApplicationsClocks.
+ *
+ * For Kepler &tm; or newer fully supported devices.
+ *
+ * @param device The identifier of the target device
+ * @param count Reference in which to provide the \a clocksMHz array size, and
+ * to return the number of elements
+ * @param clocksMHz Reference in which to return the clock in MHz
+ *
+ * @return
+ * - \ref NVML_SUCCESS if \a count and \a clocksMHz have been populated
+ * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized
+ * - \ref NVML_ERROR_INVALID_ARGUMENT if \a device is invalid or \a count is NULL
+ * - \ref NVML_ERROR_NOT_SUPPORTED if the device does not support this feature
+ * - \ref NVML_ERROR_INSUFFICIENT_SIZE if \a count is too small (\a count is set to the number of
+ * required elements)
+ * - \ref NVML_ERROR_GPU_IS_LOST if the target GPU has fallen off the bus or is otherwise inaccessible
+ * - \ref NVML_ERROR_UNKNOWN on any unexpected error
+ *
+ * @see nvmlDeviceSetApplicationsClocks
+ * @see nvmlDeviceGetSupportedGraphicsClocks
+ */
+nvmlReturn_t DECLDIR nvmlDeviceGetSupportedMemoryClocks(nvmlDevice_t device, unsigned int *count, unsigned int *clocksMHz);
+
+/**
+ * Retrieves the list of possible graphics clocks that can be used as an argument for \ref nvmlDeviceSetApplicationsClocks.
+ *
+ * For Kepler &tm; or newer fully supported devices.
+ *
+ * @param device The identifier of the target device
+ * @param memoryClockMHz Memory clock for which to return possible graphics clocks
+ * @param count Reference in which to provide the \a clocksMHz array size, and
+ * to return the number of elements
+ * @param clocksMHz Reference in which to return the clocks in MHz
+ *
+ * @return
+ * - \ref NVML_SUCCESS if \a count and \a clocksMHz have been populated
+ * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized
+ * - \ref NVML_ERROR_NOT_FOUND if the specified \a memoryClockMHz is not a supported frequency
+ * - \ref NVML_ERROR_INVALID_ARGUMENT if \a device is invalid or \a count is NULL
+ * - \ref NVML_ERROR_NOT_SUPPORTED if the device does not support this feature
+ * - \ref NVML_ERROR_INSUFFICIENT_SIZE if \a count is too small
+ * - \ref NVML_ERROR_GPU_IS_LOST if the target GPU has fallen off the bus or is otherwise inaccessible
+ * - \ref NVML_ERROR_UNKNOWN on any unexpected error
+ *
+ * @see nvmlDeviceSetApplicationsClocks
+ * @see nvmlDeviceGetSupportedMemoryClocks
+ */
+nvmlReturn_t DECLDIR nvmlDeviceGetSupportedGraphicsClocks(nvmlDevice_t device, unsigned int memoryClockMHz, unsigned int *count, unsigned int *clocksMHz);
+
+/**
+ * Retrieve the current state of Auto Boosted clocks on a device and store it in \a isEnabled
+ *
+ * For Kepler &tm; or newer fully supported devices.
+ *
+ * Auto Boosted clocks are enabled by default on some hardware, allowing the GPU to run at higher clock rates
+ * to maximize performance as thermal limits allow.
+ *
+ * On Pascal and newer hardware, Auto Boosted clocks are controlled through application clocks.
+ * Use \ref nvmlDeviceSetApplicationsClocks and \ref nvmlDeviceResetApplicationsClocks to control Auto Boost
+ * behavior.
+ *
+ * @param device The identifier of the target device
+ * @param isEnabled Where to store the current state of Auto Boosted clocks of the target device
+ * @param defaultIsEnabled Where to store the default Auto Boosted clocks behavior of the target device that the device will
+ * revert to when no applications are using the GPU
+ *
+ * @return
+ * - \ref NVML_SUCCESS If \a isEnabled has been set with the Auto Boosted clocks state of \a device
+ * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized
+ * - \ref NVML_ERROR_INVALID_ARGUMENT if \a device is invalid or \a isEnabled is NULL
+ * - \ref NVML_ERROR_NOT_SUPPORTED if the device does not support Auto Boosted clocks
+ * - \ref NVML_ERROR_GPU_IS_LOST if the target GPU has fallen off the bus or is otherwise inaccessible
+ * - \ref NVML_ERROR_UNKNOWN on any unexpected error
+ *
+ */
+nvmlReturn_t DECLDIR nvmlDeviceGetAutoBoostedClocksEnabled(nvmlDevice_t device, nvmlEnableState_t *isEnabled, nvmlEnableState_t *defaultIsEnabled);
+
+/**
+ * Try to set the current state of Auto Boosted clocks on a device.
+ *
+ * For Kepler &tm; or newer fully supported devices.
+ *
+ * Auto Boosted clocks are enabled by default on some hardware, allowing the GPU to run at higher clock rates
+ * to maximize performance as thermal limits allow. Auto Boosted clocks should be disabled if fixed clock
+ * rates are desired.
+ *
+ * Non-root users may use this API by default but can be restricted by root from using this API by calling
+ * \ref nvmlDeviceSetAPIRestriction with apiType=NVML_RESTRICTED_API_SET_AUTO_BOOSTED_CLOCKS.
+ * Note: Persistence Mode is required to modify current Auto Boost settings, therefore, it must be enabled.
+ *
+ * On Pascal and newer hardware, Auto Boosted clocks are controlled through application clocks.
+ * Use \ref nvmlDeviceSetApplicationsClocks and \ref nvmlDeviceResetApplicationsClocks to control Auto Boost
+ * behavior.
+ *
+ * @param device The identifier of the target device
+ * @param enabled What state to try to set Auto Boosted clocks of the target device to
+ *
+ * @return
+ * - \ref NVML_SUCCESS If the Auto Boosted clocks were successfully set to the state specified by \a enabled
+ * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized
+ * - \ref NVML_ERROR_INVALID_ARGUMENT if \a device is invalid
+ * - \ref NVML_ERROR_NOT_SUPPORTED if the device does not support Auto Boosted clocks
+ * - \ref NVML_ERROR_GPU_IS_LOST if the target GPU has fallen off the bus or is otherwise inaccessible
+ * - \ref NVML_ERROR_UNKNOWN on any unexpected error
+ *
+ */
+nvmlReturn_t DECLDIR nvmlDeviceSetAutoBoostedClocksEnabled(nvmlDevice_t device, nvmlEnableState_t enabled);
+
+/**
+ * Try to set the default state of Auto Boosted clocks on a device. This is the default state that Auto Boosted clocks will
+ * return to when no compute processes (e.g. CUDA applications which have an active context) are running
+ *
+ * For Kepler &tm; or newer non-GeForce fully supported devices and Maxwell or newer GeForce devices.
+ * Requires root/admin permissions.
+ *
+ * Auto Boosted clocks are enabled by default on some hardware, allowing the GPU to run at higher clock rates
+ * to maximize performance as thermal limits allow. Auto Boosted clocks should be disabled if fixed clock
+ * rates are desired.
+ *
+ * On Pascal and newer hardware, Auto Boosted clocks are controlled through application clocks.
+ * Use \ref nvmlDeviceSetApplicationsClocks and \ref nvmlDeviceResetApplicationsClocks to control Auto Boost
+ * behavior.
+ *
+ * @param device The identifier of the target device
+ * @param enabled What state to try to set default Auto Boosted clocks of the target device to
+ * @param flags Flags that change the default behavior. Currently Unused.
+ *
+ * @return
+ * - \ref NVML_SUCCESS If the Auto Boosted clock's default state was successfully set to the state specified by \a enabled
+ * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized
+ * - \ref NVML_ERROR_NO_PERMISSION If the calling user does not have permission to change Auto Boosted clock's default state.
+ * - \ref NVML_ERROR_INVALID_ARGUMENT if \a device is invalid
+ * - \ref NVML_ERROR_NOT_SUPPORTED if the device does not support Auto Boosted clocks
+ * - \ref NVML_ERROR_GPU_IS_LOST if the target GPU has fallen off the bus or is otherwise inaccessible
+ * - \ref NVML_ERROR_UNKNOWN on any unexpected error
+ *
+ */
+nvmlReturn_t DECLDIR nvmlDeviceSetDefaultAutoBoostedClocksEnabled(nvmlDevice_t device, nvmlEnableState_t enabled, unsigned int flags);
+
+
+/**
+ * Retrieves the intended operating speed of the device's fan.
+ *
+ * Note: The reported speed is the intended fan speed. If the fan is physically blocked and unable to spin, the
+ * output will not match the actual fan speed.
+ *
+ * For all discrete products with dedicated fans.
+ *
+ * The fan speed is expressed as a percent of the maximum, i.e. full speed is 100%.
+ *
+ * @param device The identifier of the target device
+ * @param speed Reference in which to return the fan speed percentage
+ *
+ * @return
+ * - \ref NVML_SUCCESS if \a speed has been set
+ * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized
+ * - \ref NVML_ERROR_INVALID_ARGUMENT if \a device is invalid or \a speed is NULL
+ * - \ref NVML_ERROR_NOT_SUPPORTED if the device does not have a fan
+ * - \ref NVML_ERROR_GPU_IS_LOST if the target GPU has fallen off the bus or is otherwise inaccessible
+ * - \ref NVML_ERROR_UNKNOWN on any unexpected error
+ */
+nvmlReturn_t DECLDIR nvmlDeviceGetFanSpeed(nvmlDevice_t device, unsigned int *speed);
+
+/**
+ * Retrieves the current temperature readings for the device, in degrees C.
+ *
+ * For all products.
+ *
+ * See \ref nvmlTemperatureSensors_t for details on available temperature sensors.
+ *
+ * @param device The identifier of the target device
+ * @param sensorType Flag that indicates which sensor reading to retrieve
+ * @param temp Reference in which to return the temperature reading
+ *
+ * @return
+ * - \ref NVML_SUCCESS if \a temp has been set
+ * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized
+ * - \ref NVML_ERROR_INVALID_ARGUMENT if \a device is invalid, \a sensorType is invalid or \a temp is NULL
+ * - \ref NVML_ERROR_NOT_SUPPORTED if the device does not have the specified sensor
+ * - \ref NVML_ERROR_GPU_IS_LOST if the target GPU has fallen off the bus or is otherwise inaccessible
+ * - \ref NVML_ERROR_UNKNOWN on any unexpected error
+ */
+nvmlReturn_t DECLDIR nvmlDeviceGetTemperature(nvmlDevice_t device, nvmlTemperatureSensors_t sensorType, unsigned int *temp);
+
+/**
+ * Retrieves the temperature threshold for the GPU with the specified threshold type in degrees C.
+ *
+ * For Kepler &tm; or newer fully supported devices.
+ *
+ * See \ref nvmlTemperatureThresholds_t for details on available temperature thresholds.
+ *
+ * @param device The identifier of the target device
+ * @param thresholdType The type of threshold value queried
+ * @param temp Reference in which to return the temperature reading
+ * @return
+ * - \ref NVML_SUCCESS if \a temp has been set
+ * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized
+ * - \ref NVML_ERROR_INVALID_ARGUMENT if \a device is invalid, \a thresholdType is invalid or \a temp is NULL
+ * - \ref NVML_ERROR_NOT_SUPPORTED if the device does not have a temperature sensor or is unsupported
+ * - \ref NVML_ERROR_GPU_IS_LOST if the target GPU has fallen off the bus or is otherwise inaccessible
+ * - \ref NVML_ERROR_UNKNOWN on any unexpected error
+ */
+nvmlReturn_t DECLDIR nvmlDeviceGetTemperatureThreshold(nvmlDevice_t device, nvmlTemperatureThresholds_t thresholdType, unsigned int *temp);
+
+/**
+ * Retrieves the current performance state for the device.
+ *
+ * For Fermi &tm; or newer fully supported devices.
+ *
+ * See \ref nvmlPstates_t for details on allowed performance states.
+ *
+ * @param device The identifier of the target device
+ * @param pState Reference in which to return the performance state reading
+ *
+ * @return
+ * - \ref NVML_SUCCESS if \a pState has been set
+ * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized
+ * - \ref NVML_ERROR_INVALID_ARGUMENT if \a device is invalid or \a pState is NULL
+ * - \ref NVML_ERROR_NOT_SUPPORTED if the device does not support this feature
+ * - \ref NVML_ERROR_GPU_IS_LOST if the target GPU has fallen off the bus or is otherwise inaccessible
+ * - \ref NVML_ERROR_UNKNOWN on any unexpected error
+ */
+nvmlReturn_t DECLDIR nvmlDeviceGetPerformanceState(nvmlDevice_t device, nvmlPstates_t *pState);
+
+/**
+ * Retrieves current clocks throttling reasons.
+ *
+ * For all fully supported products.
+ *
+ * \note More than one bit can be enabled at the same time. Multiple reasons can be affecting clocks at once.
+ *
+ * @param device The identifier of the target device
+ * @param clocksThrottleReasons Reference in which to return bitmask of active clocks throttle
+ * reasons
+ *
+ * @return
+ * - \ref NVML_SUCCESS if \a clocksThrottleReasons has been set
+ * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized
+ * - \ref NVML_ERROR_INVALID_ARGUMENT if \a device is invalid or \a clocksThrottleReasons is NULL
+ * - \ref NVML_ERROR_NOT_SUPPORTED if the device does not support this feature
+ * - \ref NVML_ERROR_GPU_IS_LOST if the target GPU has fallen off the bus or is otherwise inaccessible
+ * - \ref NVML_ERROR_UNKNOWN on any unexpected error
+ *
+ * @see nvmlClocksThrottleReasons
+ * @see nvmlDeviceGetSupportedClocksThrottleReasons
+ */
+nvmlReturn_t DECLDIR nvmlDeviceGetCurrentClocksThrottleReasons(nvmlDevice_t device, unsigned long long *clocksThrottleReasons);
+
+/**
+ * Retrieves bitmask of supported clocks throttle reasons that can be returned by
+ * \ref nvmlDeviceGetCurrentClocksThrottleReasons
+ *
+ * For all fully supported products.
+ *
+ * This method is not supported in virtual machines running virtual GPU (vGPU).
+ *
+ * @param device The identifier of the target device
+ * @param supportedClocksThrottleReasons Reference in which to return bitmask of supported
+ * clocks throttle reasons
+ *
+ * @return
+ * - \ref NVML_SUCCESS if \a supportedClocksThrottleReasons has been set
+ * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized
+ * - \ref NVML_ERROR_INVALID_ARGUMENT if \a device is invalid or \a supportedClocksThrottleReasons is NULL
+ * - \ref NVML_ERROR_GPU_IS_LOST if the target GPU has fallen off the bus or is otherwise inaccessible
+ * - \ref NVML_ERROR_UNKNOWN on any unexpected error
+ *
+ * @see nvmlClocksThrottleReasons
+ * @see nvmlDeviceGetCurrentClocksThrottleReasons
+ */
+nvmlReturn_t DECLDIR nvmlDeviceGetSupportedClocksThrottleReasons(nvmlDevice_t device, unsigned long long *supportedClocksThrottleReasons);
+
+/**
+ * Deprecated: Use \ref nvmlDeviceGetPerformanceState. This function exposes an incorrect generalization.
+ *
+ * Retrieve the current performance state for the device.
+ *
+ * For Fermi &tm; or newer fully supported devices.
+ *
+ * See \ref nvmlPstates_t for details on allowed performance states.
+ *
+ * @param device The identifier of the target device
+ * @param pState Reference in which to return the performance state reading
+ *
+ * @return
+ * - \ref NVML_SUCCESS if \a pState has been set
+ * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized
+ * - \ref NVML_ERROR_INVALID_ARGUMENT if \a device is invalid or \a pState is NULL
+ * - \ref NVML_ERROR_NOT_SUPPORTED if the device does not support this feature
+ * - \ref NVML_ERROR_GPU_IS_LOST if the target GPU has fallen off the bus or is otherwise inaccessible
+ * - \ref NVML_ERROR_UNKNOWN on any unexpected error
+ */
+nvmlReturn_t DECLDIR nvmlDeviceGetPowerState(nvmlDevice_t device, nvmlPstates_t *pState);
+
+/**
+ * This API has been deprecated.
+ *
+ * Retrieves the power management mode associated with this device.
+ *
+ * For products from the Fermi family.
+ * - Requires \a NVML_INFOROM_POWER version 3.0 or higher.
+ *
+ * For products from the Kepler or newer families.
+ * - Does not require \a NVML_INFOROM_POWER object.
+ *
+ * This flag indicates whether any power management algorithm is currently active on the device. An
+ * enabled state does not necessarily mean the device is being actively throttled -- only that
+ * the driver will do so if the appropriate conditions are met.
+ *
+ * See \ref nvmlEnableState_t for details on allowed modes.
+ *
+ * @param device The identifier of the target device
+ * @param mode Reference in which to return the current power management mode
+ *
+ * @return
+ * - \ref NVML_SUCCESS if \a mode has been set
+ * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized
+ * - \ref NVML_ERROR_INVALID_ARGUMENT if \a device is invalid or \a mode is NULL
+ * - \ref NVML_ERROR_NOT_SUPPORTED if the device does not support this feature
+ * - \ref NVML_ERROR_GPU_IS_LOST if the target GPU has fallen off the bus or is otherwise inaccessible
+ * - \ref NVML_ERROR_UNKNOWN on any unexpected error
+ */
+nvmlReturn_t DECLDIR nvmlDeviceGetPowerManagementMode(nvmlDevice_t device, nvmlEnableState_t *mode);
+
+/**
+ * Retrieves the power management limit associated with this device.
+ *
+ * For Fermi &tm; or newer fully supported devices.
+ *
+ * The power limit defines the upper boundary for the card's power draw. If
+ * the card's total power draw reaches this limit the power management algorithm kicks in.
+ *
+ * This reading is only available if power management mode is supported.
+ * See \ref nvmlDeviceGetPowerManagementMode.
+ *
+ * @param device The identifier of the target device
+ * @param limit Reference in which to return the power management limit in milliwatts
+ *
+ * @return
+ * - \ref NVML_SUCCESS if \a limit has been set
+ * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized
+ * - \ref NVML_ERROR_INVALID_ARGUMENT if \a device is invalid or \a limit is NULL
+ * - \ref NVML_ERROR_NOT_SUPPORTED if the device does not support this feature
+ * - \ref NVML_ERROR_GPU_IS_LOST if the target GPU has fallen off the bus or is otherwise inaccessible
+ * - \ref NVML_ERROR_UNKNOWN on any unexpected error
+ */
+nvmlReturn_t DECLDIR nvmlDeviceGetPowerManagementLimit(nvmlDevice_t device, unsigned int *limit);
+
+/**
+ * Retrieves information about possible values of power management limits on this device.
+ *
+ * For Kepler &tm; or newer fully supported devices.
+ *
+ * @param device The identifier of the target device
+ * @param minLimit Reference in which to return the minimum power management limit in milliwatts
+ * @param maxLimit Reference in which to return the maximum power management limit in milliwatts
+ *
+ * @return
+ * - \ref NVML_SUCCESS if \a minLimit and \a maxLimit have been set
+ * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized
+ * - \ref NVML_ERROR_INVALID_ARGUMENT if \a device is invalid or \a minLimit or \a maxLimit is NULL
+ * - \ref NVML_ERROR_NOT_SUPPORTED if the device does not support this feature
+ * - \ref NVML_ERROR_GPU_IS_LOST if the target GPU has fallen off the bus or is otherwise inaccessible
+ * - \ref NVML_ERROR_UNKNOWN on any unexpected error
+ *
+ * @see nvmlDeviceSetPowerManagementLimit
+ */
+nvmlReturn_t DECLDIR nvmlDeviceGetPowerManagementLimitConstraints(nvmlDevice_t device, unsigned int *minLimit, unsigned int *maxLimit);
+
+/**
+ * Retrieves default power management limit on this device, in milliwatts.
+ * Default power management limit is a power management limit that the device boots with.
+ *
+ * For Kepler &tm; or newer fully supported devices.
+ *
+ * @param device The identifier of the target device
+ * @param defaultLimit Reference in which to return the default power management limit in milliwatts
+ *
+ * @return
+ * - \ref NVML_SUCCESS if \a defaultLimit has been set
+ * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized
+ * - \ref NVML_ERROR_INVALID_ARGUMENT if \a device is invalid or \a defaultLimit is NULL
+ * - \ref NVML_ERROR_NOT_SUPPORTED if the device does not support this feature
+ * - \ref NVML_ERROR_GPU_IS_LOST if the target GPU has fallen off the bus or is otherwise inaccessible
+ * - \ref NVML_ERROR_UNKNOWN on any unexpected error
+ */
+nvmlReturn_t DECLDIR nvmlDeviceGetPowerManagementDefaultLimit(nvmlDevice_t device, unsigned int *defaultLimit);
+
+/**
+ * Retrieves power usage for this GPU and its associated circuitry (e.g. memory), in milliwatts.
+ *
+ * For Fermi &tm; or newer fully supported devices.
+ *
+ * On Fermi and Kepler GPUs the reading is accurate to within +/- 5% of current power draw.
+ *
+ * It is only available if power management mode is supported. See \ref nvmlDeviceGetPowerManagementMode.
+ *
+ * @param device The identifier of the target device
+ * @param power Reference in which to return the power usage information
+ *
+ * @return
+ * - \ref NVML_SUCCESS if \a power has been populated
+ * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized
+ * - \ref NVML_ERROR_INVALID_ARGUMENT if \a device is invalid or \a power is NULL
+ * - \ref NVML_ERROR_NOT_SUPPORTED if the device does not support power readings
+ * - \ref NVML_ERROR_GPU_IS_LOST if the target GPU has fallen off the bus or is otherwise inaccessible
+ * - \ref NVML_ERROR_UNKNOWN on any unexpected error
+ */
+nvmlReturn_t DECLDIR nvmlDeviceGetPowerUsage(nvmlDevice_t device, unsigned int *power);
+
+/**
+ * Retrieves total energy consumption for this GPU in millijoules (mJ) since the driver was last reloaded
+ *
+ * For fully supported devices newer than Pascal &tm;.
+ *
+ * @param device The identifier of the target device
+ * @param energy Reference in which to return the energy consumption information
+ *
+ * @return
+ * - \ref NVML_SUCCESS if \a energy has been populated
+ * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized
+ * - \ref NVML_ERROR_INVALID_ARGUMENT if \a device is invalid or \a energy is NULL
+ * - \ref NVML_ERROR_NOT_SUPPORTED if the device does not support energy readings
+ * - \ref NVML_ERROR_GPU_IS_LOST if the target GPU has fallen off the bus or is otherwise inaccessible
+ * - \ref NVML_ERROR_UNKNOWN on any unexpected error
+ */
+nvmlReturn_t DECLDIR nvmlDeviceGetTotalEnergyConsumption(nvmlDevice_t device, unsigned long long *energy);
+
+/**
+ * Get the effective power limit that the driver enforces after taking into account all limiters
+ *
+ * Note: This can be different from the \ref nvmlDeviceGetPowerManagementLimit if other limits are set elsewhere.
+ * This includes the out-of-band power limit interface.
+ *
+ * For Kepler &tm; or newer fully supported devices.
+ *
+ * @param device The device to communicate with
+ * @param limit Reference in which to return the power management limit in milliwatts
+ *
+ * @return
+ * - \ref NVML_SUCCESS if \a limit has been set
+ * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized
+ * - \ref NVML_ERROR_INVALID_ARGUMENT if \a device is invalid or \a limit is NULL
+ * - \ref NVML_ERROR_NOT_SUPPORTED if the device does not support this feature
+ * - \ref NVML_ERROR_GPU_IS_LOST if the target GPU has fallen off the bus or is otherwise inaccessible
+ * - \ref NVML_ERROR_UNKNOWN on any unexpected error
+ */
+nvmlReturn_t DECLDIR nvmlDeviceGetEnforcedPowerLimit(nvmlDevice_t device, unsigned int *limit);
+
+/**
+ * Retrieves the current GOM and pending GOM (the one that GPU will switch to after reboot).
+ *
+ * For GK110 M-class and X-class Tesla &tm; products from the Kepler family.
+ * Modes \ref NVML_GOM_LOW_DP and \ref NVML_GOM_ALL_ON are supported on fully supported GeForce products.
+ * Not supported on Quadro ® and Tesla &tm; C-class products.
+ *
+ * @param device The identifier of the target device
+ * @param current Reference in which to return the current GOM
+ * @param pending Reference in which to return the pending GOM
+ *
+ * @return
+ * - \ref NVML_SUCCESS if \a current and \a pending have been populated
+ * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized
+ * - \ref NVML_ERROR_INVALID_ARGUMENT if \a device is invalid or \a current or \a pending is NULL
+ * - \ref NVML_ERROR_NOT_SUPPORTED if the device does not support this feature
+ * - \ref NVML_ERROR_GPU_IS_LOST if the target GPU has fallen off the bus or is otherwise inaccessible
+ * - \ref NVML_ERROR_UNKNOWN on any unexpected error
+ *
+ * @see nvmlGpuOperationMode_t
+ * @see nvmlDeviceSetGpuOperationMode
+ */
+nvmlReturn_t DECLDIR nvmlDeviceGetGpuOperationMode(nvmlDevice_t device, nvmlGpuOperationMode_t *current, nvmlGpuOperationMode_t *pending);
+
+/**
+ * Retrieves the amount of used, free and total memory available on the device, in bytes.
+ *
+ * For all products.
+ *
+ * Enabling ECC reduces the amount of total available memory, due to the extra required parity bits.
+ * Under WDDM most device memory is allocated and managed on startup by Windows.
+ *
+ * Under Linux and Windows TCC, the reported amount of used memory is equal to the sum of memory allocated
+ * by all active channels on the device.
+ *
+ * See \ref nvmlMemory_t for details on available memory info.
+ *
+ * @param device The identifier of the target device
+ * @param memory Reference in which to return the memory information
+ *
+ * @return
+ * - \ref NVML_SUCCESS if \a memory has been populated
+ * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized
+ * - \ref NVML_ERROR_INVALID_ARGUMENT if \a device is invalid or \a memory is NULL
+ * - \ref NVML_ERROR_GPU_IS_LOST if the target GPU has fallen off the bus or is otherwise inaccessible
+ * - \ref NVML_ERROR_UNKNOWN on any unexpected error
+ */
+nvmlReturn_t DECLDIR nvmlDeviceGetMemoryInfo(nvmlDevice_t device, nvmlMemory_t *memory);
+
+/**
+ * Retrieves the current compute mode for the device.
+ *
+ * For all products.
+ *
+ * See \ref nvmlComputeMode_t for details on allowed compute modes.
+ *
+ * @param device The identifier of the target device
+ * @param mode Reference in which to return the current compute mode
+ *
+ * @return
+ * - \ref NVML_SUCCESS if \a mode has been set
+ * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized
+ * - \ref NVML_ERROR_INVALID_ARGUMENT if \a device is invalid or \a mode is NULL
+ * - \ref NVML_ERROR_NOT_SUPPORTED if the device does not support this feature
+ * - \ref NVML_ERROR_GPU_IS_LOST if the target GPU has fallen off the bus or is otherwise inaccessible
+ * - \ref NVML_ERROR_UNKNOWN on any unexpected error
+ *
+ * @see nvmlDeviceSetComputeMode()
+ */
+nvmlReturn_t DECLDIR nvmlDeviceGetComputeMode(nvmlDevice_t device, nvmlComputeMode_t *mode);
+
+/**
+ * Retrieves the CUDA compute capability of the device.
+ *
+ * For all products.
+ *
+ * Returns the major and minor compute capability version numbers of the
+ * device. The major and minor versions are equivalent to the
+ * CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MAJOR and
+ * CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MINOR attributes that would be
+ * returned by CUDA's cuDeviceGetAttribute().
+ *
+ * @param device The identifier of the target device
+ * @param major Reference in which to return the major CUDA compute capability
+ * @param minor Reference in which to return the minor CUDA compute capability
+ *
+ * @return
+ * - \ref NVML_SUCCESS if \a major and \a minor have been set
+ * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized
+ * - \ref NVML_ERROR_INVALID_ARGUMENT if \a device is invalid or \a major or \a minor are NULL
+ * - \ref NVML_ERROR_GPU_IS_LOST if the target GPU has fallen off the bus or is otherwise inaccessible
+ * - \ref NVML_ERROR_UNKNOWN on any unexpected error
+ */
+nvmlReturn_t DECLDIR nvmlDeviceGetCudaComputeCapability(nvmlDevice_t device, int *major, int *minor);
+
+/**
+ * Retrieves the current and pending ECC modes for the device.
+ *
+ * For Fermi &tm; or newer fully supported devices.
+ * Only applicable to devices with ECC.
+ * Requires \a NVML_INFOROM_ECC version 1.0 or higher.
+ *
+ * Changing ECC modes requires a reboot. The "pending" ECC mode refers to the target mode following
+ * the next reboot.
+ *
+ * See \ref nvmlEnableState_t for details on allowed modes.
+ *
+ * @param device The identifier of the target device
+ * @param current Reference in which to return the current ECC mode
+ * @param pending Reference in which to return the pending ECC mode
+ *
+ * @return
+ * - \ref NVML_SUCCESS if \a current and \a pending have been set
+ * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized
+ * - \ref NVML_ERROR_INVALID_ARGUMENT if \a device is invalid or either \a current or \a pending is NULL
+ * - \ref NVML_ERROR_NOT_SUPPORTED if the device does not support this feature
+ * - \ref NVML_ERROR_GPU_IS_LOST if the target GPU has fallen off the bus or is otherwise inaccessible
+ * - \ref NVML_ERROR_UNKNOWN on any unexpected error
+ *
+ * @see nvmlDeviceSetEccMode()
+ */
+nvmlReturn_t DECLDIR nvmlDeviceGetEccMode(nvmlDevice_t device, nvmlEnableState_t *current, nvmlEnableState_t *pending);
+
+/**
+ * Retrieves the device boardId from 0-N.
+ * Devices with the same boardId indicate GPUs connected to the same PLX. Use in conjunction with
+ * \ref nvmlDeviceGetMultiGpuBoard() to decide if they are on the same board as well.
+ * The boardId returned is a unique ID for the current configuration. Uniqueness and ordering across
+ * reboots and system configurations is not guaranteed (i.e. if a Tesla K40c returns 0x100 and
+ * the two GPUs on a Tesla K10 in the same system return 0x200, it is not guaranteed they will
+ * always return those values but they will always be different from each other).
+ *
+ *
+ * For Fermi &tm; or newer fully supported devices.
+ *
+ * @param device The identifier of the target device
+ * @param boardId Reference in which to return the device's board ID
+ *
+ * @return
+ * - \ref NVML_SUCCESS if \a boardId has been set
+ * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized
+ * - \ref NVML_ERROR_INVALID_ARGUMENT if \a device is invalid or \a boardId is NULL
+ * - \ref NVML_ERROR_NOT_SUPPORTED if the device does not support this feature
+ * - \ref NVML_ERROR_GPU_IS_LOST if the target GPU has fallen off the bus or is otherwise inaccessible
+ * - \ref NVML_ERROR_UNKNOWN on any unexpected error
+ */
+nvmlReturn_t DECLDIR nvmlDeviceGetBoardId(nvmlDevice_t device, unsigned int *boardId);
+
+/**
+ * Retrieves whether the device is on a Multi-GPU Board
+ * Devices that are on multi-GPU boards will set \a multiGpuBool to a non-zero value.
+ *
+ * For Fermi &tm; or newer fully supported devices.
+ *
+ * @param device The identifier of the target device
+ * @param multiGpuBool Reference in which to return a zero or non-zero value
+ * to indicate whether the device is on a multi GPU board
+ *
+ * @return
+ * - \ref NVML_SUCCESS if \a multiGpuBool has been set
+ * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized
+ * - \ref NVML_ERROR_INVALID_ARGUMENT if \a device is invalid or \a multiGpuBool is NULL
+ * - \ref NVML_ERROR_NOT_SUPPORTED if the device does not support this feature
+ * - \ref NVML_ERROR_GPU_IS_LOST if the target GPU has fallen off the bus or is otherwise inaccessible
+ * - \ref NVML_ERROR_UNKNOWN on any unexpected error
+ */
+nvmlReturn_t DECLDIR nvmlDeviceGetMultiGpuBoard(nvmlDevice_t device, unsigned int *multiGpuBool);
+
+/**
+ * Retrieves the total ECC error counts for the device.
+ *
+ * For Fermi &tm; or newer fully supported devices.
+ * Only applicable to devices with ECC.
+ * Requires \a NVML_INFOROM_ECC version 1.0 or higher.
+ * Requires ECC Mode to be enabled.
+ *
+ * The total error count is the sum of errors across each of the separate memory systems, i.e. the total set of
+ * errors across the entire device.
+ *
+ * See \ref nvmlMemoryErrorType_t for a description of available error types.\n
+ * See \ref nvmlEccCounterType_t for a description of available counter types.
+ *
+ * @param device The identifier of the target device
+ * @param errorType Flag that specifies the type of the errors.
+ * @param counterType Flag that specifies the counter-type of the errors.
+ * @param eccCounts Reference in which to return the specified ECC errors
+ *
+ * @return
+ * - \ref NVML_SUCCESS if \a eccCounts has been set
+ * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized
+ * - \ref NVML_ERROR_INVALID_ARGUMENT if \a device, \a errorType or \a counterType is invalid, or \a eccCounts is NULL
+ * - \ref NVML_ERROR_NOT_SUPPORTED if the device does not support this feature
+ * - \ref NVML_ERROR_GPU_IS_LOST if the target GPU has fallen off the bus or is otherwise inaccessible
+ * - \ref NVML_ERROR_UNKNOWN on any unexpected error
+ *
+ * @see nvmlDeviceClearEccErrorCounts()
+ */
+nvmlReturn_t DECLDIR nvmlDeviceGetTotalEccErrors(nvmlDevice_t device, nvmlMemoryErrorType_t errorType, nvmlEccCounterType_t counterType, unsigned long long *eccCounts);
+
+/**
+ * Retrieves the detailed ECC error counts for the device.
+ *
+ * @deprecated This API supports only a fixed set of ECC error locations
+ * On different GPU architectures different locations are supported
+ * See \ref nvmlDeviceGetMemoryErrorCounter
+ *
+ * For Fermi &tm; or newer fully supported devices.
+ * Only applicable to devices with ECC.
+ * Requires \a NVML_INFOROM_ECC version 2.0 or higher to report aggregate location-based ECC counts.
+ * Requires \a NVML_INFOROM_ECC version 1.0 or higher to report all other ECC counts.
+ * Requires ECC Mode to be enabled.
+ *
+ * Detailed errors provide separate ECC counts for specific parts of the memory system.
+ *
+ * Reports zero for unsupported ECC error counters when only a subset of ECC error counters is supported.
+ *
+ * See \ref nvmlMemoryErrorType_t for a description of available bit types.\n
+ * See \ref nvmlEccCounterType_t for a description of available counter types.\n
+ * See \ref nvmlEccErrorCounts_t for a description of provided detailed ECC counts.
+ *
+ * @param device The identifier of the target device
+ * @param errorType Flag that specifies the type of the errors.
+ * @param counterType Flag that specifies the counter-type of the errors.
+ * @param eccCounts Reference in which to return the specified ECC errors
+ *
+ * @return
+ * - \ref NVML_SUCCESS if \a eccCounts has been populated
+ * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized
+ * - \ref NVML_ERROR_INVALID_ARGUMENT if \a device, \a errorType or \a counterType is invalid, or \a eccCounts is NULL
+ * - \ref NVML_ERROR_NOT_SUPPORTED if the device does not support this feature
+ * - \ref NVML_ERROR_GPU_IS_LOST if the target GPU has fallen off the bus or is otherwise inaccessible
+ * - \ref NVML_ERROR_UNKNOWN on any unexpected error
+ *
+ * @see nvmlDeviceClearEccErrorCounts()
+ */
+nvmlReturn_t DECLDIR nvmlDeviceGetDetailedEccErrors(nvmlDevice_t device, nvmlMemoryErrorType_t errorType, nvmlEccCounterType_t counterType, nvmlEccErrorCounts_t *eccCounts);
+
+/**
+ * Retrieves the requested memory error counter for the device.
+ *
+ * For Fermi &tm; or newer fully supported devices.
+ * Requires \a NVML_INFOROM_ECC version 2.0 or higher to report aggregate location-based memory error counts.
+ * Requires \a NVML_INFOROM_ECC version 1.0 or higher to report all other memory error counts.
+ *
+ * Only applicable to devices with ECC.
+ *
+ * Requires ECC Mode to be enabled.
+ *
+ * See \ref nvmlMemoryErrorType_t for a description of available memory error types.\n
+ * See \ref nvmlEccCounterType_t for a description of available counter types.\n
+ * See \ref nvmlMemoryLocation_t for a description of available counter locations.\n
+ *
+ * @param device The identifier of the target device
+ * @param errorType Flag that specifies the type of error.
+ * @param counterType Flag that specifies the counter-type of the errors.
+ * @param locationType Specifies the location of the counter.
+ * @param count Reference in which to return the ECC counter
+ *
+ * @return
+ * - \ref NVML_SUCCESS if \a count has been populated
+ * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized
+ * - \ref NVML_ERROR_INVALID_ARGUMENT if \a device, \a errorType, \a counterType or \a locationType is
+ * invalid, or \a count is NULL
+ * - \ref NVML_ERROR_NOT_SUPPORTED if the device does not support ECC error reporting in the specified memory
+ * - \ref NVML_ERROR_GPU_IS_LOST if the target GPU has fallen off the bus or is otherwise inaccessible
+ * - \ref NVML_ERROR_UNKNOWN on any unexpected error
+ */
+nvmlReturn_t DECLDIR nvmlDeviceGetMemoryErrorCounter(nvmlDevice_t device, nvmlMemoryErrorType_t errorType,
+ nvmlEccCounterType_t counterType,
+ nvmlMemoryLocation_t locationType, unsigned long long *count);
+
+/**
+ * Retrieves the current utilization rates for the device's major subsystems.
+ *
+ * For Fermi &tm; or newer fully supported devices.
+ *
+ * See \ref nvmlUtilization_t for details on available utilization rates.
+ *
+ * \note During driver initialization when ECC is enabled one can see high GPU and Memory Utilization readings.
+ * This is caused by ECC Memory Scrubbing mechanism that is performed during driver initialization.
+ *
+ * @param device The identifier of the target device
+ * @param utilization Reference in which to return the utilization information
+ *
+ * @return
+ * - \ref NVML_SUCCESS if \a utilization has been populated
+ * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized
+ * - \ref NVML_ERROR_INVALID_ARGUMENT if \a device is invalid or \a utilization is NULL
+ * - \ref NVML_ERROR_NOT_SUPPORTED if the device does not support this feature
+ * - \ref NVML_ERROR_GPU_IS_LOST if the target GPU has fallen off the bus or is otherwise inaccessible
+ * - \ref NVML_ERROR_UNKNOWN on any unexpected error
+ */
+nvmlReturn_t DECLDIR nvmlDeviceGetUtilizationRates(nvmlDevice_t device, nvmlUtilization_t *utilization);
+
+/**
+ * Retrieves the current utilization and sampling period in microseconds for the Encoder
+ *
+ * For Kepler &tm; or newer fully supported devices.
+ *
+ * @param device The identifier of the target device
+ * @param utilization Reference to an unsigned int for encoder utilization info
+ * @param samplingPeriodUs Reference to an unsigned int for the sampling period in microseconds
+ *
+ * @return
+ * - \ref NVML_SUCCESS if \a utilization has been populated
+ * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized
+ * - \ref NVML_ERROR_INVALID_ARGUMENT if \a device is invalid, \a utilization is NULL, or \a samplingPeriodUs is NULL
+ * - \ref NVML_ERROR_NOT_SUPPORTED if the device does not support this feature
+ * - \ref NVML_ERROR_GPU_IS_LOST if the target GPU has fallen off the bus or is otherwise inaccessible
+ * - \ref NVML_ERROR_UNKNOWN on any unexpected error
+ */
+nvmlReturn_t DECLDIR nvmlDeviceGetEncoderUtilization(nvmlDevice_t device, unsigned int *utilization, unsigned int *samplingPeriodUs);
+
+/**
+ * Retrieves the current capacity of the device's encoder, as a percentage of maximum encoder capacity with valid values in the range 0-100.
+ *
+ * For Maxwell &tm; or newer fully supported devices.
+ *
+ * @param device The identifier of the target device
+ * @param encoderQueryType Type of encoder to query
+ * @param encoderCapacity Reference to an unsigned int for the encoder capacity
+ *
+ * @return
+ * - \ref NVML_SUCCESS if \a encoderCapacity is fetched
+ * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized
+ * - \ref NVML_ERROR_INVALID_ARGUMENT if \a encoderCapacity is NULL, or \a device or \a encoderQueryType
+ * are invalid
+ * - \ref NVML_ERROR_NOT_SUPPORTED if device does not support the encoder specified in \a encoderQueryType
+ * - \ref NVML_ERROR_GPU_IS_LOST if the target GPU has fallen off the bus or is otherwise inaccessible
+ * - \ref NVML_ERROR_UNKNOWN on any unexpected error
+ */
+nvmlReturn_t DECLDIR nvmlDeviceGetEncoderCapacity (nvmlDevice_t device, nvmlEncoderType_t encoderQueryType, unsigned int *encoderCapacity);
+
+/**
+ * Retrieves the current encoder statistics for a given device.
+ *
+ * For Maxwell &tm; or newer fully supported devices.
+ *
+ * @param device The identifier of the target device
+ * @param sessionCount Reference to an unsigned int for count of active encoder sessions
+ * @param averageFps Reference to an unsigned int for trailing average FPS of all active sessions
+ * @param averageLatency Reference to an unsigned int for encode latency in microseconds
+ *
+ * @return
+ * - \ref NVML_SUCCESS if \a sessionCount, \a averageFps and \a averageLatency are fetched
+ * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized
+ * - \ref NVML_ERROR_INVALID_ARGUMENT if \a sessionCount, or \a device or \a averageFps,
+ * or \a averageLatency is NULL
+ * - \ref NVML_ERROR_GPU_IS_LOST if the target GPU has fallen off the bus or is otherwise inaccessible
+ * - \ref NVML_ERROR_UNKNOWN on any unexpected error
+ */
+nvmlReturn_t DECLDIR nvmlDeviceGetEncoderStats (nvmlDevice_t device, unsigned int *sessionCount,
+ unsigned int *averageFps, unsigned int *averageLatency);
+
+/**
+ * Retrieves information about active encoder sessions on a target device.
+ *
+ * An array of active encoder sessions is returned in the caller-supplied buffer pointed at by \a sessionInfos. The
+ * array element count is passed in \a sessionCount, and \a sessionCount is used to return the number of sessions
+ * written to the buffer.
+ *
+ * If the supplied buffer is not large enough to accommodate the active session array, the function returns
+ * NVML_ERROR_INSUFFICIENT_SIZE, with the element count of nvmlEncoderSessionInfo_t array required in \a sessionCount.
+ * To query the number of active encoder sessions, call this function with *sessionCount = 0. The code will return
+ * NVML_SUCCESS with number of active encoder sessions updated in *sessionCount.
+ *
+ * For Maxwell &tm; or newer fully supported devices.
+ *
+ * @param device The identifier of the target device
+ * @param sessionCount Reference to caller supplied array size, and returns the number of sessions.
+ * @param sessionInfos Reference in which to return the session information
+ *
+ * @return
+ * - \ref NVML_SUCCESS if \a sessionInfos is fetched
+ * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized
+ * - \ref NVML_ERROR_INSUFFICIENT_SIZE if \a sessionCount is too small, array element count is returned in \a sessionCount
+ * - \ref NVML_ERROR_INVALID_ARGUMENT if \a sessionCount is NULL.
+ * - \ref NVML_ERROR_GPU_IS_LOST if the target GPU has fallen off the bus or is otherwise inaccessible
+ * - \ref NVML_ERROR_UNKNOWN on any unexpected error
+ */
+nvmlReturn_t DECLDIR nvmlDeviceGetEncoderSessions(nvmlDevice_t device, unsigned int *sessionCount, nvmlEncoderSessionInfo_t *sessionInfos);
+
+/**
+ * Retrieves the current utilization and sampling period in microseconds for the Decoder
+ *
+ * For Kepler &tm; or newer fully supported devices.
+ *
+ * @param device The identifier of the target device
+ * @param utilization Reference to an unsigned int for decoder utilization info
+ * @param samplingPeriodUs Reference to an unsigned int for the sampling period in microseconds
+ *
+ * @return
+ * - \ref NVML_SUCCESS if \a utilization has been populated
+ * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized
+ * - \ref NVML_ERROR_INVALID_ARGUMENT if \a device is invalid, \a utilization is NULL, or \a samplingPeriodUs is NULL
+ * - \ref NVML_ERROR_NOT_SUPPORTED if the device does not support this feature
+ * - \ref NVML_ERROR_GPU_IS_LOST if the target GPU has fallen off the bus or is otherwise inaccessible
+ * - \ref NVML_ERROR_UNKNOWN on any unexpected error
+ */
+nvmlReturn_t DECLDIR nvmlDeviceGetDecoderUtilization(nvmlDevice_t device, unsigned int *utilization, unsigned int *samplingPeriodUs);
+
+/**
+ * Retrieves the current and pending driver model for the device.
+ *
+ * For Fermi &tm; or newer fully supported devices.
+ * For Windows only.
+ *
+ * On Windows platforms the device driver can run in either WDDM or WDM (TCC) mode. If a display is attached
+ * to the device it must run in WDDM mode. TCC mode is preferred if a display is not attached.
+ *
+ * See \ref nvmlDriverModel_t for details on available driver models.
+ *
+ * @param device The identifier of the target device
+ * @param current Reference in which to return the current driver model
+ * @param pending Reference in which to return the pending driver model
+ *
+ * @return
+ * - \ref NVML_SUCCESS if either \a current and/or \a pending have been set
+ * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized
+ * - \ref NVML_ERROR_INVALID_ARGUMENT if \a device is invalid or both \a current and \a pending are NULL
+ * - \ref NVML_ERROR_NOT_SUPPORTED if the platform is not Windows
+ * - \ref NVML_ERROR_GPU_IS_LOST if the target GPU has fallen off the bus or is otherwise inaccessible
+ * - \ref NVML_ERROR_UNKNOWN on any unexpected error
+ *
+ * @see nvmlDeviceSetDriverModel()
+ */
+nvmlReturn_t DECLDIR nvmlDeviceGetDriverModel(nvmlDevice_t device, nvmlDriverModel_t *current, nvmlDriverModel_t *pending);
+
+/**
+ * Get VBIOS version of the device.
+ *
+ * For all products.
+ *
+ * The VBIOS version may change from time to time. It will not exceed 32 characters in length
+ * (including the NULL terminator). See \ref nvmlConstants::NVML_DEVICE_VBIOS_VERSION_BUFFER_SIZE.
+ *
+ * @param device The identifier of the target device
+ * @param version Reference to which to return the VBIOS version
+ * @param length The maximum allowed length of the string returned in \a version
+ *
+ * @return
+ * - \ref NVML_SUCCESS if \a version has been set
+ * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized
+ * - \ref NVML_ERROR_INVALID_ARGUMENT if \a device is invalid, or \a version is NULL
+ * - \ref NVML_ERROR_INSUFFICIENT_SIZE if \a length is too small
+ * - \ref NVML_ERROR_GPU_IS_LOST if the target GPU has fallen off the bus or is otherwise inaccessible
+ * - \ref NVML_ERROR_UNKNOWN on any unexpected error
+ */
+nvmlReturn_t DECLDIR nvmlDeviceGetVbiosVersion(nvmlDevice_t device, char *version, unsigned int length);
+
+/**
+ * Get Bridge Chip Information for all the bridge chips on the board.
+ *
+ * For all fully supported products.
+ * Only applicable to multi-GPU products.
+ *
+ * @param device The identifier of the target device
+ * @param bridgeHierarchy Reference to the returned bridge chip hierarchy
+ *
+ * @return
+ * - \ref NVML_SUCCESS if bridge chip exists
+ * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized
+ * - \ref NVML_ERROR_INVALID_ARGUMENT if \a device is invalid, or \a bridgeHierarchy is NULL
+ * - \ref NVML_ERROR_NOT_SUPPORTED if bridge chip not supported on the device
+ * - \ref NVML_ERROR_GPU_IS_LOST if the target GPU has fallen off the bus or is otherwise inaccessible
+ * - \ref NVML_ERROR_UNKNOWN on any unexpected error
+ *
+ */
+nvmlReturn_t DECLDIR nvmlDeviceGetBridgeChipInfo(nvmlDevice_t device, nvmlBridgeChipHierarchy_t *bridgeHierarchy);
+
+/**
+ * Get information about processes with a compute context on a device
+ *
+ * For Fermi &tm; or newer fully supported devices.
+ *
+ * This function returns information only about compute running processes (e.g. CUDA application which have
+ * active context). Any graphics applications (e.g. using OpenGL, DirectX) won't be listed by this function.
+ *
+ * To query the current number of running compute processes, call this function with *infoCount = 0. The
+ * return code will be NVML_ERROR_INSUFFICIENT_SIZE, or NVML_SUCCESS if none are running. For this call
+ * \a infos is allowed to be NULL.
+ *
+ * The usedGpuMemory field returned is all of the memory used by the application.
+ *
+ * Keep in mind that information returned by this call is dynamic and the number of elements might change in
+ * time. Allocate more space for \a infos table in case new compute processes are spawned.
+ *
+ * @param device The identifier of the target device
+ * @param infoCount Reference in which to provide the \a infos array size, and
+ * to return the number of returned elements
+ * @param infos Reference in which to return the process information
+ *
+ * @return
+ * - \ref NVML_SUCCESS if \a infoCount and \a infos have been populated
+ * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized
+ * - \ref NVML_ERROR_INSUFFICIENT_SIZE if \a infoCount indicates that the \a infos array is too small
+ * \a infoCount will contain minimal amount of space necessary for
+ * the call to complete
+ * - \ref NVML_ERROR_INVALID_ARGUMENT if \a device is invalid, either of \a infoCount or \a infos is NULL
+ * - \ref NVML_ERROR_GPU_IS_LOST if the target GPU has fallen off the bus or is otherwise inaccessible
+ * - \ref NVML_ERROR_UNKNOWN on any unexpected error
+ *
+ * @see \ref nvmlSystemGetProcessName
+ */
+nvmlReturn_t DECLDIR nvmlDeviceGetComputeRunningProcesses(nvmlDevice_t device, unsigned int *infoCount, nvmlProcessInfo_t *infos);
+
+/**
+ * Get information about processes with a graphics context on a device
+ *
+ * For Kepler &tm; or newer fully supported devices.
+ *
+ * This function returns information only about graphics based processes
+ * (e.g. applications using OpenGL, DirectX)
+ *
+ * To query the current number of running graphics processes, call this function with *infoCount = 0. The
+ * return code will be NVML_ERROR_INSUFFICIENT_SIZE, or NVML_SUCCESS if none are running. For this call
+ * \a infos is allowed to be NULL.
+ *
+ * The usedGpuMemory field returned is all of the memory used by the application.
+ *
+ * Keep in mind that information returned by this call is dynamic and the number of elements might change in
+ * time. Allocate more space for \a infos table in case new graphics processes are spawned.
+ *
+ * @param device The identifier of the target device
+ * @param infoCount Reference in which to provide the \a infos array size, and
+ * to return the number of returned elements
+ * @param infos Reference in which to return the process information
+ *
+ * @return
+ * - \ref NVML_SUCCESS if \a infoCount and \a infos have been populated
+ * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized
+ * - \ref NVML_ERROR_INSUFFICIENT_SIZE if \a infoCount indicates that the \a infos array is too small
+ * \a infoCount will contain minimal amount of space necessary for
+ * the call to complete
+ * - \ref NVML_ERROR_INVALID_ARGUMENT if \a device is invalid, either of \a infoCount or \a infos is NULL
+ * - \ref NVML_ERROR_GPU_IS_LOST if the target GPU has fallen off the bus or is otherwise inaccessible
+ * - \ref NVML_ERROR_UNKNOWN on any unexpected error
+ *
+ * @see \ref nvmlSystemGetProcessName
+ */
+nvmlReturn_t DECLDIR nvmlDeviceGetGraphicsRunningProcesses(nvmlDevice_t device, unsigned int *infoCount, nvmlProcessInfo_t *infos);
+
+/**
+ * Check if the GPU devices are on the same physical board.
+ *
+ * For all fully supported products.
+ *
+ * @param device1 The first GPU device
+ * @param device2 The second GPU device
+ * @param onSameBoard Reference in which to return the status.
+ * Non-zero indicates that the GPUs are on the same board.
+ *
+ * @return
+ * - \ref NVML_SUCCESS if \a onSameBoard has been set
+ * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized
+ * - \ref NVML_ERROR_INVALID_ARGUMENT if \a device1 or \a device2 are invalid or \a onSameBoard is NULL
+ * - \ref NVML_ERROR_NOT_SUPPORTED if this check is not supported by the device
+ * - \ref NVML_ERROR_GPU_IS_LOST if either GPU has fallen off the bus or is otherwise inaccessible
+ * - \ref NVML_ERROR_UNKNOWN on any unexpected error
+ */
+nvmlReturn_t DECLDIR nvmlDeviceOnSameBoard(nvmlDevice_t device1, nvmlDevice_t device2, int *onSameBoard);
+
+/**
+ * Retrieves the root/admin permissions on the target API. See \a nvmlRestrictedAPI_t for the list of supported APIs.
+ * If an API is restricted only root users can call that API. See \a nvmlDeviceSetAPIRestriction to change current permissions.
+ *
+ * For all fully supported products.
+ *
+ * @param device The identifier of the target device
+ * @param apiType Target API type for this operation
+ * @param isRestricted Reference in which to return the current restriction
+ * NVML_FEATURE_ENABLED indicates that the API is root-only
+ * NVML_FEATURE_DISABLED indicates that the API is accessible to all users
+ *
+ * @return
+ * - \ref NVML_SUCCESS if \a isRestricted has been set
+ * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized
+ * - \ref NVML_ERROR_INVALID_ARGUMENT if \a device is invalid, \a apiType incorrect or \a isRestricted is NULL
+ * - \ref NVML_ERROR_NOT_SUPPORTED if this query is not supported by the device or the device does not support
+ * the feature that is being queried (E.G. Enabling/disabling Auto Boosted clocks is
+ * not supported by the device)
+ * - \ref NVML_ERROR_GPU_IS_LOST if the target GPU has fallen off the bus or is otherwise inaccessible
+ * - \ref NVML_ERROR_UNKNOWN on any unexpected error
+ *
+ * @see nvmlRestrictedAPI_t
+ */
+nvmlReturn_t DECLDIR nvmlDeviceGetAPIRestriction(nvmlDevice_t device, nvmlRestrictedAPI_t apiType, nvmlEnableState_t *isRestricted);
+
+/**
+ * Gets recent samples for the GPU.
+ *
+ * For Kepler &tm; or newer fully supported devices.
+ *
+ * Based on type, this method can be used to fetch the power, utilization or clock samples maintained in the buffer by
+ * the driver.
+ *
+ * Power, Utilization and Clock samples are returned as type "unsigned int" for the union nvmlValue_t.
+ *
+ * To get the size of samples that user needs to allocate, the method is invoked with samples set to NULL.
+ * The returned sampleCount will provide the number of samples that can be queried. The user needs to
+ * allocate the buffer with size as sampleCount * sizeof(nvmlSample_t).
+ *
+ * lastSeenTimeStamp represents CPU timestamp in microseconds. Set it to 0 to fetch all the samples maintained by the
+ * underlying buffer. Set lastSeenTimeStamp to one of the timeStamps retrieved from the data of the previous query
+ * to get more recent samples.
+ *
+ * This method fetches the number of entries which can be accommodated in the provided samples array, and the
+ * reference sampleCount is updated to indicate how many samples were actually retrieved. The advantage of using this
+ * method for samples in contrast to polling via existing methods is to get higher frequency data at lower polling cost.
+ *
+ * @param device The identifier for the target device
+ * @param type Type of sampling event
+ * @param lastSeenTimeStamp Return only samples with timestamp greater than lastSeenTimeStamp.
+ * @param sampleValType Output parameter to represent the type of sample value as described in nvmlValueType_t
+ * @param sampleCount Reference to provide the number of elements which can be queried in samples array
+ * @param samples Reference in which samples are returned
+ *
+ * @return
+ * - \ref NVML_SUCCESS if samples are successfully retrieved
+ * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized
+ * - \ref NVML_ERROR_INVALID_ARGUMENT if \a device is invalid, \a sampleCount is NULL or
+ * reference to \a sampleCount is 0 for non null \a samples
+ * - \ref NVML_ERROR_NOT_SUPPORTED if this query is not supported by the device
+ * - \ref NVML_ERROR_GPU_IS_LOST if the target GPU has fallen off the bus or is otherwise inaccessible
+ * - \ref NVML_ERROR_NOT_FOUND if sample entries are not found
+ * - \ref NVML_ERROR_UNKNOWN on any unexpected error
+ */
+nvmlReturn_t DECLDIR nvmlDeviceGetSamples(nvmlDevice_t device, nvmlSamplingType_t type, unsigned long long lastSeenTimeStamp,
+ nvmlValueType_t *sampleValType, unsigned int *sampleCount, nvmlSample_t *samples);
+
+/**
+ * Gets Total, Available and Used size of BAR1 memory.
+ *
+ * BAR1 is used to map the FB (device memory) so that it can be directly accessed by the CPU or by 3rd party
+ * devices (peer-to-peer on the PCIE bus).
+ *
+ * For Kepler &tm; or newer fully supported devices.
+ *
+ * @param device The identifier of the target device
+ * @param bar1Memory Reference in which BAR1 memory
+ * information is returned.
+ *
+ * @return
+ * - \ref NVML_SUCCESS if BAR1 memory is successfully retrieved
+ * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized
+ * - \ref NVML_ERROR_INVALID_ARGUMENT if \a device is invalid, \a bar1Memory is NULL
+ * - \ref NVML_ERROR_NOT_SUPPORTED if this query is not supported by the device
+ * - \ref NVML_ERROR_GPU_IS_LOST if the target GPU has fallen off the bus or is otherwise inaccessible
+ * - \ref NVML_ERROR_UNKNOWN on any unexpected error
+ *
+ */
+nvmlReturn_t DECLDIR nvmlDeviceGetBAR1MemoryInfo(nvmlDevice_t device, nvmlBAR1Memory_t *bar1Memory);
+
+
+/**
+ * Gets the duration of time during which the device was throttled (lower than requested clocks) due to power
+ * or thermal constraints.
+ *
+ * The method is important to users who are trying to understand if their GPUs throttle at any point during their applications. The
+ * difference in violation times at two different reference times gives the indication of GPU throttling event.
+ *
+ * Violation for thermal capping is not supported at this time.
+ *
+ * For Kepler &tm; or newer fully supported devices.
+ *
+ * @param device The identifier of the target device
+ * @param perfPolicyType Represents Performance policy which can trigger GPU throttling
+ * @param violTime Reference to which violation time related information is returned
+ *
+ *
+ * @return
+ * - \ref NVML_SUCCESS if violation time is successfully retrieved
+ * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized
+ * - \ref NVML_ERROR_INVALID_ARGUMENT if \a device is invalid, \a perfPolicyType is invalid, or \a violTime is NULL
+ * - \ref NVML_ERROR_NOT_SUPPORTED if this query is not supported by the device
+ * - \ref NVML_ERROR_GPU_IS_LOST if the target GPU has fallen off the bus or is otherwise inaccessible
+ *
+ */
+nvmlReturn_t DECLDIR nvmlDeviceGetViolationStatus(nvmlDevice_t device, nvmlPerfPolicyType_t perfPolicyType, nvmlViolationTime_t *violTime);
+
+/**
+ * @}
+ */
+
+/** @addtogroup nvmlAccountingStats
+ * @{
+ */
+
+/**
+ * Queries the state of per process accounting mode.
+ *
+ * For Kepler &tm; or newer fully supported devices.
+ *
+ * See \ref nvmlDeviceGetAccountingStats for more details.
+ * See \ref nvmlDeviceSetAccountingMode
+ *
+ * @param device The identifier of the target device
+ * @param mode Reference in which to return the current accounting mode
+ *
+ * @return
+ * - \ref NVML_SUCCESS if the mode has been successfully retrieved
+ * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized
+ * - \ref NVML_ERROR_INVALID_ARGUMENT if \a device is invalid or \a mode is NULL
+ * - \ref NVML_ERROR_NOT_SUPPORTED if the device doesn't support this feature
+ * - \ref NVML_ERROR_UNKNOWN on any unexpected error
+ */
+nvmlReturn_t DECLDIR nvmlDeviceGetAccountingMode(nvmlDevice_t device, nvmlEnableState_t *mode);
+
+/**
+ * Queries process's accounting stats.
+ *
+ * For Kepler &tm; or newer fully supported devices.
+ *
+ * Accounting stats capture GPU utilization and other statistics across the lifetime of a process.
+ * Accounting stats can be queried during life time of the process and after its termination.
+ * The time field in \ref nvmlAccountingStats_t is reported as 0 during the lifetime of the process and
+ * updated to actual running time after its termination.
+ * Accounting stats are kept in a circular buffer, newly created processes overwrite information about old
+ * processes.
+ *
+ * See \ref nvmlAccountingStats_t for description of each returned metric.
+ * List of processes that can be queried can be retrieved from \ref nvmlDeviceGetAccountingPids.
+ *
+ * @note Accounting Mode needs to be on. See \ref nvmlDeviceGetAccountingMode.
+ * @note Only compute and graphics applications stats can be queried. Monitoring applications stats can't be
+ * queried since they don't contribute to GPU utilization.
+ * @note In case of pid collision stats of only the latest process (that terminated last) will be reported
+ *
+ * @warning On Kepler devices per process statistics are accurate only if there's one process running on a GPU.
+ *
+ * @param device The identifier of the target device
+ * @param pid Process Id of the target process to query stats for
+ * @param stats Reference in which to return the process's accounting stats
+ *
+ * @return
+ * - \ref NVML_SUCCESS if stats have been successfully retrieved
+ * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized
+ * - \ref NVML_ERROR_INVALID_ARGUMENT if \a device is invalid or \a stats is NULL
+ * - \ref NVML_ERROR_NOT_FOUND if process stats were not found
+ * - \ref NVML_ERROR_NOT_SUPPORTED if the device doesn't support this feature or accounting mode is disabled
+ * - \ref NVML_ERROR_UNKNOWN on any unexpected error
+ *
+ * @see nvmlDeviceGetAccountingBufferSize
+ */
+nvmlReturn_t DECLDIR nvmlDeviceGetAccountingStats(nvmlDevice_t device, unsigned int pid, nvmlAccountingStats_t *stats);
+
+/**
+ * Queries list of processes that can be queried for accounting stats. The list of processes returned
+ * can be in running or terminated state.
+ *
+ * For Kepler &tm; or newer fully supported devices.
+ *
+ * To just query the number of processes ready to be queried, call this function with *count = 0 and
+ * pids=NULL. The return code will be NVML_ERROR_INSUFFICIENT_SIZE, or NVML_SUCCESS if list is empty.
+ *
+ * For more details see \ref nvmlDeviceGetAccountingStats.
+ *
+ * @note In case of PID collision some processes might not be accessible before the circular buffer is full.
+ *
+ * @param device The identifier of the target device
+ * @param count Reference in which to provide the \a pids array size, and
+ * to return the number of elements ready to be queried
+ * @param pids Reference in which to return list of process ids
+ *
+ * @return
+ * - \ref NVML_SUCCESS if pids were successfully retrieved
+ * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized
+ * - \ref NVML_ERROR_INVALID_ARGUMENT if \a device is invalid or \a count is NULL
+ * - \ref NVML_ERROR_NOT_SUPPORTED if the device doesn't support this feature or accounting mode is disabled
+ * - \ref NVML_ERROR_INSUFFICIENT_SIZE if \a count is too small (\a count is set to
+ * expected value)
+ * - \ref NVML_ERROR_UNKNOWN on any unexpected error
+ *
+ * @see nvmlDeviceGetAccountingBufferSize
+ */
+nvmlReturn_t DECLDIR nvmlDeviceGetAccountingPids(nvmlDevice_t device, unsigned int *count, unsigned int *pids);
+
+/**
+ * Returns the number of processes that the circular buffer with accounting pids can hold.
+ *
+ * For Kepler &tm; or newer fully supported devices.
+ *
+ * This is the maximum number of processes that accounting information will be stored for before information
+ * about oldest processes will get overwritten by information about new processes.
+ *
+ * @param device The identifier of the target device
+ * @param bufferSize Reference in which to provide the size (in number of elements)
+ * of the circular buffer for accounting stats.
+ *
+ * @return
+ * - \ref NVML_SUCCESS if buffer size was successfully retrieved
+ * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized
+ * - \ref NVML_ERROR_INVALID_ARGUMENT if \a device is invalid or \a bufferSize is NULL
+ * - \ref NVML_ERROR_NOT_SUPPORTED if the device doesn't support this feature or accounting mode is disabled
+ * - \ref NVML_ERROR_UNKNOWN on any unexpected error
+ *
+ * @see nvmlDeviceGetAccountingStats
+ * @see nvmlDeviceGetAccountingPids
+ */
+nvmlReturn_t DECLDIR nvmlDeviceGetAccountingBufferSize(nvmlDevice_t device, unsigned int *bufferSize);
+
+/** @} */
+
+/** @addtogroup nvmlDeviceQueries
+ * @{
+ */
+
+/**
+ * Returns the list of retired pages by source, including pages that are pending retirement.
+ * The address information provided from this API is the hardware address of the page that was retired. Note
+ * that this does not match the virtual address used in CUDA, but will match the address information in XID 63
+ *
+ * For Kepler &tm; or newer fully supported devices.
+ *
+ * @param device The identifier of the target device
+ * @param cause Filter page addresses by cause of retirement
+ * @param pageCount Reference in which to provide the \a addresses buffer size, and
+ * to return the number of retired pages that match \a cause
+ * Set to 0 to query the size without allocating an \a addresses buffer
+ * @param addresses Buffer to write the page addresses into
+ *
+ * @return
+ * - \ref NVML_SUCCESS if \a pageCount was populated and \a addresses was filled
+ * - \ref NVML_ERROR_INSUFFICIENT_SIZE if \a pageCount indicates the buffer is not large enough to store all the
+ * matching page addresses. \a pageCount is set to the needed size.
+ * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized
+ * - \ref NVML_ERROR_INVALID_ARGUMENT if \a device is invalid, \a pageCount is NULL, \a cause is invalid, or
+ * \a addresses is NULL
+ * - \ref NVML_ERROR_NOT_SUPPORTED if the device doesn't support this feature
+ * - \ref NVML_ERROR_GPU_IS_LOST if the target GPU has fallen off the bus or is otherwise inaccessible
+ * - \ref NVML_ERROR_UNKNOWN on any unexpected error
+ */
+nvmlReturn_t DECLDIR nvmlDeviceGetRetiredPages(nvmlDevice_t device, nvmlPageRetirementCause_t cause,
+ unsigned int *pageCount, unsigned long long *addresses);
+
+/**
+ * Check if any pages are pending retirement and need a reboot to fully retire.
+ *
+ * For Kepler &tm; or newer fully supported devices.
+ *
+ * @param device The identifier of the target device
+ * @param isPending Reference in which to return the pending status
+ *
+ * @return
+ * - \ref NVML_SUCCESS if \a isPending was populated
+ * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized
+ * - \ref NVML_ERROR_INVALID_ARGUMENT if \a device is invalid or \a isPending is NULL
+ * - \ref NVML_ERROR_NOT_SUPPORTED if the device doesn't support this feature
+ * - \ref NVML_ERROR_GPU_IS_LOST if the target GPU has fallen off the bus or is otherwise inaccessible
+ * - \ref NVML_ERROR_UNKNOWN on any unexpected error
+ */
+nvmlReturn_t DECLDIR nvmlDeviceGetRetiredPagesPendingStatus(nvmlDevice_t device, nvmlEnableState_t *isPending);
+
+/** @} */
+
+/***************************************************************************************************/
+/** @defgroup nvmlUnitCommands Unit Commands
+ * This chapter describes NVML operations that change the state of the unit. For S-class products.
+ * Each of these requires root/admin access. Non-admin users will see an NVML_ERROR_NO_PERMISSION
+ * error code when invoking any of these methods.
+ * @{
+ */
+/***************************************************************************************************/
+
+/**
+ * Set the LED state for the unit. The LED can be either green (0) or amber (1).
+ *
+ * For S-class products.
+ * Requires root/admin permissions.
+ *
+ * This operation takes effect immediately.
+ *
+ *
+ * Current S-Class products don't provide unique LEDs for each unit. As such, both front
+ * and back LEDs will be toggled in unison regardless of which unit is specified with this command.
+ *
+ * See \ref nvmlLedColor_t for available colors.
+ *
+ * @param unit The identifier of the target unit
+ * @param color The target LED color
+ *
+ * @return
+ * - \ref NVML_SUCCESS if the LED color has been set
+ * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized
+ * - \ref NVML_ERROR_INVALID_ARGUMENT if \a unit or \a color is invalid
+ * - \ref NVML_ERROR_NOT_SUPPORTED if this is not an S-class product
+ * - \ref NVML_ERROR_NO_PERMISSION if the user doesn't have permission to perform this operation
+ * - \ref NVML_ERROR_UNKNOWN on any unexpected error
+ *
+ * @see nvmlUnitGetLedState()
+ */
+nvmlReturn_t DECLDIR nvmlUnitSetLedState(nvmlUnit_t unit, nvmlLedColor_t color);
+
+/** @} */
+
+/***************************************************************************************************/
+/** @defgroup nvmlDeviceCommands Device Commands
+ * This chapter describes NVML operations that change the state of the device.
+ * Each of these requires root/admin access. Non-admin users will see an NVML_ERROR_NO_PERMISSION
+ * error code when invoking any of these methods.
+ * @{
+ */
+/***************************************************************************************************/
+
+/**
+ * Set the persistence mode for the device.
+ *
+ * For all products.
+ * For Linux only.
+ * Requires root/admin permissions.
+ *
+ * The persistence mode determines whether the GPU driver software is torn down after the last client
+ * exits.
+ *
+ * This operation takes effect immediately. It is not persistent across reboots. After each reboot the
+ * persistence mode is reset to "Disabled".
+ *
+ * See \ref nvmlEnableState_t for available modes.
+ *
+ * @param device The identifier of the target device
+ * @param mode The target persistence mode
+ *
+ * @return
+ * - \ref NVML_SUCCESS if the persistence mode was set
+ * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized
+ * - \ref NVML_ERROR_INVALID_ARGUMENT if \a device is invalid or \a mode is invalid
+ * - \ref NVML_ERROR_NOT_SUPPORTED if the device does not support this feature
+ * - \ref NVML_ERROR_NO_PERMISSION if the user doesn't have permission to perform this operation
+ * - \ref NVML_ERROR_GPU_IS_LOST if the target GPU has fallen off the bus or is otherwise inaccessible
+ * - \ref NVML_ERROR_UNKNOWN on any unexpected error
+ *
+ * @see nvmlDeviceGetPersistenceMode()
+ */
+nvmlReturn_t DECLDIR nvmlDeviceSetPersistenceMode(nvmlDevice_t device, nvmlEnableState_t mode);
+
+/**
+ * Set the compute mode for the device.
+ *
+ * For all products.
+ * Requires root/admin permissions.
+ *
+ * The compute mode determines whether a GPU can be used for compute operations and whether it can
+ * be shared across contexts.
+ *
+ * This operation takes effect immediately. Under Linux it is not persistent across reboots and
+ * always resets to "Default". Under windows it is persistent.
+ *
+ * Under windows compute mode may only be set to DEFAULT when running in WDDM
+ *
+ * See \ref nvmlComputeMode_t for details on available compute modes.
+ *
+ * @param device The identifier of the target device
+ * @param mode The target compute mode
+ *
+ * @return
+ * - \ref NVML_SUCCESS if the compute mode was set
+ * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized
+ * - \ref NVML_ERROR_INVALID_ARGUMENT if \a device is invalid or \a mode is invalid
+ * - \ref NVML_ERROR_NOT_SUPPORTED if the device does not support this feature
+ * - \ref NVML_ERROR_NO_PERMISSION if the user doesn't have permission to perform this operation
+ * - \ref NVML_ERROR_GPU_IS_LOST if the target GPU has fallen off the bus or is otherwise inaccessible
+ * - \ref NVML_ERROR_UNKNOWN on any unexpected error
+ *
+ * @see nvmlDeviceGetComputeMode()
+ */
+nvmlReturn_t DECLDIR nvmlDeviceSetComputeMode(nvmlDevice_t device, nvmlComputeMode_t mode);
+
+/**
+ * Set the ECC mode for the device.
+ *
+ * For Kepler &tm; or newer fully supported devices.
+ * Only applicable to devices with ECC.
+ * Requires \a NVML_INFOROM_ECC version 1.0 or higher.
+ * Requires root/admin permissions.
+ *
+ * The ECC mode determines whether the GPU enables its ECC support.
+ *
+ * This operation takes effect after the next reboot.
+ *
+ * See \ref nvmlEnableState_t for details on available modes.
+ *
+ * @param device The identifier of the target device
+ * @param ecc The target ECC mode
+ *
+ * @return
+ * - \ref NVML_SUCCESS if the ECC mode was set
+ * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized
+ * - \ref NVML_ERROR_INVALID_ARGUMENT if \a device is invalid or \a ecc is invalid
+ * - \ref NVML_ERROR_NOT_SUPPORTED if the device does not support this feature
+ * - \ref NVML_ERROR_NO_PERMISSION if the user doesn't have permission to perform this operation
+ * - \ref NVML_ERROR_GPU_IS_LOST if the target GPU has fallen off the bus or is otherwise inaccessible
+ * - \ref NVML_ERROR_UNKNOWN on any unexpected error
+ *
+ * @see nvmlDeviceGetEccMode()
+ */
+nvmlReturn_t DECLDIR nvmlDeviceSetEccMode(nvmlDevice_t device, nvmlEnableState_t ecc);
+
+/**
+ * Clear the ECC error and other memory error counts for the device.
+ *
+ * For Kepler &tm; or newer fully supported devices.
+ * Only applicable to devices with ECC.
+ * Requires \a NVML_INFOROM_ECC version 2.0 or higher to clear aggregate location-based ECC counts.
+ * Requires \a NVML_INFOROM_ECC version 1.0 or higher to clear all other ECC counts.
+ * Requires root/admin permissions.
+ * Requires ECC Mode to be enabled.
+ *
+ * Sets all of the specified ECC counters to 0, including both detailed and total counts.
+ *
+ * This operation takes effect immediately.
+ *
+ * See \ref nvmlMemoryErrorType_t for details on available counter types.
+ *
+ * @param device The identifier of the target device
+ * @param counterType Flag that indicates which type of errors should be cleared.
+ *
+ * @return
+ * - \ref NVML_SUCCESS if the error counts were cleared
+ * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized
+ * - \ref NVML_ERROR_INVALID_ARGUMENT if \a device is invalid or \a counterType is invalid
+ * - \ref NVML_ERROR_NOT_SUPPORTED if the device does not support this feature
+ * - \ref NVML_ERROR_NO_PERMISSION if the user doesn't have permission to perform this operation
+ * - \ref NVML_ERROR_GPU_IS_LOST if the target GPU has fallen off the bus or is otherwise inaccessible
+ * - \ref NVML_ERROR_UNKNOWN on any unexpected error
+ *
+ * @see
+ * - nvmlDeviceGetDetailedEccErrors()
+ * - nvmlDeviceGetTotalEccErrors()
+ */
+nvmlReturn_t DECLDIR nvmlDeviceClearEccErrorCounts(nvmlDevice_t device, nvmlEccCounterType_t counterType);
+
+/**
+ * Set the driver model for the device.
+ *
+ * For Fermi &tm; or newer fully supported devices.
+ * For windows only.
+ * Requires root/admin permissions.
+ *
+ * On Windows platforms the device driver can run in either WDDM or WDM (TCC) mode. If a display is attached
+ * to the device it must run in WDDM mode.
+ *
+ * It is possible to force the change to WDM (TCC) while the display is still attached with a force flag (nvmlFlagForce).
+ * This should only be done if the host is subsequently powered down and the display is detached from the device
+ * before the next reboot.
+ *
+ * This operation takes effect after the next reboot.
+ *
+ * Windows driver model may only be set to WDDM when running in DEFAULT compute mode.
+ *
+ * Changing the driver model to WDDM is not supported when GPU doesn't support graphics acceleration or
+ * will not support it after reboot. See \ref nvmlDeviceSetGpuOperationMode.
+ *
+ * See \ref nvmlDriverModel_t for details on available driver models.
+ * See \ref nvmlFlagDefault and \ref nvmlFlagForce
+ *
+ * @param device The identifier of the target device
+ * @param driverModel The target driver model
+ * @param flags Flags that change the default behavior
+ *
+ * @return
+ * - \ref NVML_SUCCESS if the driver model has been set
+ * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized
+ * - \ref NVML_ERROR_INVALID_ARGUMENT if \a device is invalid or \a driverModel is invalid
+ * - \ref NVML_ERROR_NOT_SUPPORTED if the platform is not windows or the device does not support this feature
+ * - \ref NVML_ERROR_NO_PERMISSION if the user doesn't have permission to perform this operation
+ * - \ref NVML_ERROR_GPU_IS_LOST if the target GPU has fallen off the bus or is otherwise inaccessible
+ * - \ref NVML_ERROR_UNKNOWN on any unexpected error
+ *
+ * @see nvmlDeviceGetDriverModel()
+ */
+nvmlReturn_t DECLDIR nvmlDeviceSetDriverModel(nvmlDevice_t device, nvmlDriverModel_t driverModel, unsigned int flags);
+
+/**
+ * Set clocks that applications will lock to.
+ *
+ * Sets the clocks that compute and graphics applications will be running at.
+ * e.g. CUDA driver requests these clocks during context creation which means this property
+ * defines clocks at which CUDA applications will be running unless some overspec event
+ * occurs (e.g. over power, over thermal or external HW brake).
+ *
+ * Can be used as a setting to request constant performance.
+ *
+ * On Pascal and newer hardware, this will automatically disable automatic boosting of clocks.
+ *
+ * On K80 and newer Kepler and Maxwell GPUs, users desiring fixed performance should also call
+ * \ref nvmlDeviceSetAutoBoostedClocksEnabled to prevent clocks from automatically boosting
+ * above the clock value being set.
+ *
+ * For Kepler &tm; or newer non-GeForce fully supported devices and Maxwell or newer GeForce devices.
+ * Requires root/admin permissions.
+ *
+ * See \ref nvmlDeviceGetSupportedMemoryClocks and \ref nvmlDeviceGetSupportedGraphicsClocks
+ * for details on how to list available clocks combinations.
+ *
+ * After system reboot or driver reload applications clocks go back to their default value.
+ * See \ref nvmlDeviceResetApplicationsClocks.
+ *
+ * @param device The identifier of the target device
+ * @param memClockMHz Requested memory clock in MHz
+ * @param graphicsClockMHz Requested graphics clock in MHz
+ *
+ * @return
+ * - \ref NVML_SUCCESS if new settings were successfully set
+ * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized
+ * - \ref NVML_ERROR_INVALID_ARGUMENT if \a device is invalid or \a memClockMHz and \a graphicsClockMHz
+ * is not a valid clock combination
+ * - \ref NVML_ERROR_NO_PERMISSION if the user doesn't have permission to perform this operation
+ * - \ref NVML_ERROR_NOT_SUPPORTED if the device doesn't support this feature
+ * - \ref NVML_ERROR_GPU_IS_LOST if the target GPU has fallen off the bus or is otherwise inaccessible
+ * - \ref NVML_ERROR_UNKNOWN on any unexpected error
+ */
+nvmlReturn_t DECLDIR nvmlDeviceSetApplicationsClocks(nvmlDevice_t device, unsigned int memClockMHz, unsigned int graphicsClockMHz);
+
+/**
+ * Set new power limit of this device.
+ *
+ * For Kepler &tm; or newer fully supported devices.
+ * Requires root/admin permissions.
+ *
+ * See \ref nvmlDeviceGetPowerManagementLimitConstraints to check the allowed ranges of values.
+ *
+ * \note Limit is not persistent across reboots or driver unloads.
+ * Enable persistence mode to prevent the driver from unloading when no application is using the device.
+ *
+ * @param device The identifier of the target device
+ * @param limit Power management limit in milliwatts to set
+ *
+ * @return
+ * - \ref NVML_SUCCESS if \a limit has been set
+ * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized
+ * - \ref NVML_ERROR_INVALID_ARGUMENT if \a device is invalid or \a limit is out of range
+ * - \ref NVML_ERROR_NOT_SUPPORTED if the device does not support this feature
+ * - \ref NVML_ERROR_GPU_IS_LOST if the target GPU has fallen off the bus or is otherwise inaccessible
+ * - \ref NVML_ERROR_UNKNOWN on any unexpected error
+ *
+ * @see nvmlDeviceGetPowerManagementLimitConstraints
+ * @see nvmlDeviceGetPowerManagementDefaultLimit
+ */
+nvmlReturn_t DECLDIR nvmlDeviceSetPowerManagementLimit(nvmlDevice_t device, unsigned int limit);
+
+/**
+ * Sets new GOM. See \a nvmlGpuOperationMode_t for details.
+ *
+ * For GK110 M-class and X-class Tesla &tm; products from the Kepler family.
+ * Modes \ref NVML_GOM_LOW_DP and \ref NVML_GOM_ALL_ON are supported on fully supported GeForce products.
+ * Not supported on Quadro ® and Tesla &tm; C-class products.
+ * Requires root/admin permissions.
+ *
+ * Changing GOMs requires a reboot.
+ * The reboot requirement might be removed in the future.
+ *
+ * Compute only GOMs don't support graphics acceleration. Under windows switching to these GOMs when
+ * pending driver model is WDDM is not supported. See \ref nvmlDeviceSetDriverModel.
+ *
+ * @param device The identifier of the target device
+ * @param mode Target GOM
+ *
+ * @return
+ * - \ref NVML_SUCCESS if \a mode has been set
+ * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized
+ * - \ref NVML_ERROR_INVALID_ARGUMENT if \a device is invalid or \a mode is incorrect
+ * - \ref NVML_ERROR_NOT_SUPPORTED if the device does not support GOM or specific mode
+ * - \ref NVML_ERROR_NO_PERMISSION if the user doesn't have permission to perform this operation
+ * - \ref NVML_ERROR_GPU_IS_LOST if the target GPU has fallen off the bus or is otherwise inaccessible
+ * - \ref NVML_ERROR_UNKNOWN on any unexpected error
+ *
+ * @see nvmlGpuOperationMode_t
+ * @see nvmlDeviceGetGpuOperationMode
+ */
+nvmlReturn_t DECLDIR nvmlDeviceSetGpuOperationMode(nvmlDevice_t device, nvmlGpuOperationMode_t mode);
+
+/**
+ * Changes the root/admin restrictions on certain APIs. See \a nvmlRestrictedAPI_t for the list of supported APIs.
+ * This method can be used by a root/admin user to give non-root/admin access to certain otherwise-restricted APIs.
+ * The new setting lasts for the lifetime of the NVIDIA driver; it is not persistent. See \a nvmlDeviceGetAPIRestriction
+ * to query the current restriction settings.
+ *
+ * For Kepler &tm; or newer fully supported devices.
+ * Requires root/admin permissions.
+ *
+ * @param device The identifier of the target device
+ * @param apiType Target API type for this operation
+ * @param isRestricted The target restriction
+ *
+ * @return
+ * - \ref NVML_SUCCESS if \a isRestricted has been set
+ * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized
+ * - \ref NVML_ERROR_INVALID_ARGUMENT if \a device is invalid or \a apiType is incorrect
+ * - \ref NVML_ERROR_NOT_SUPPORTED if the device does not support changing API restrictions or the device does not support
+ * the feature that api restrictions are being set for (E.G. Enabling/disabling auto
+ * boosted clocks is not supported by the device)
+ * - \ref NVML_ERROR_NO_PERMISSION if the user doesn't have permission to perform this operation
+ * - \ref NVML_ERROR_GPU_IS_LOST if the target GPU has fallen off the bus or is otherwise inaccessible
+ * - \ref NVML_ERROR_UNKNOWN on any unexpected error
+ *
+ * @see nvmlRestrictedAPI_t
+ */
+nvmlReturn_t DECLDIR nvmlDeviceSetAPIRestriction(nvmlDevice_t device, nvmlRestrictedAPI_t apiType, nvmlEnableState_t isRestricted);
+
+/**
+ * @}
+ */
+
+/** @addtogroup nvmlAccountingStats
+ * @{
+ */
+
+/**
+ * Enables or disables per process accounting.
+ *
+ * For Kepler &tm; or newer fully supported devices.
+ * Requires root/admin permissions.
+ *
+ * @note This setting is not persistent and will default to disabled after driver unloads.
+ * Enable persistence mode to be sure the setting doesn't switch off to disabled.
+ *
+ * @note Enabling accounting mode has no negative impact on the GPU performance.
+ *
+ * @note Disabling accounting clears all accounting pids information.
+ *
+ * See \ref nvmlDeviceGetAccountingMode
+ * See \ref nvmlDeviceGetAccountingStats
+ * See \ref nvmlDeviceClearAccountingPids
+ *
+ * @param device The identifier of the target device
+ * @param mode The target accounting mode
+ *
+ * @return
+ * - \ref NVML_SUCCESS if the new mode has been set
+ * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized
+ * - \ref NVML_ERROR_INVALID_ARGUMENT if \a device or \a mode are invalid
+ * - \ref NVML_ERROR_NOT_SUPPORTED if the device doesn't support this feature
+ * - \ref NVML_ERROR_NO_PERMISSION if the user doesn't have permission to perform this operation
+ * - \ref NVML_ERROR_UNKNOWN on any unexpected error
+ */
+nvmlReturn_t DECLDIR nvmlDeviceSetAccountingMode(nvmlDevice_t device, nvmlEnableState_t mode);
+
+/**
+ * Clears accounting information about all processes that have already terminated.
+ *
+ * For Kepler &tm; or newer fully supported devices.
+ * Requires root/admin permissions.
+ *
+ * See \ref nvmlDeviceGetAccountingMode
+ * See \ref nvmlDeviceGetAccountingStats
+ * See \ref nvmlDeviceSetAccountingMode
+ *
+ * @param device The identifier of the target device
+ *
+ * @return
+ * - \ref NVML_SUCCESS if accounting information has been cleared
+ * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized
+ * - \ref NVML_ERROR_INVALID_ARGUMENT if \a device is invalid
+ * - \ref NVML_ERROR_NOT_SUPPORTED if the device doesn't support this feature
+ * - \ref NVML_ERROR_NO_PERMISSION if the user doesn't have permission to perform this operation
+ * - \ref NVML_ERROR_UNKNOWN on any unexpected error
+ */
+nvmlReturn_t DECLDIR nvmlDeviceClearAccountingPids(nvmlDevice_t device);
+
+/** @} */
+
+/***************************************************************************************************/
+/** @defgroup NvLink NvLink Methods
+ * This chapter describes methods that NVML can perform on NVLINK enabled devices.
+ * @{
+ */
+/***************************************************************************************************/
+
+/**
+ * Retrieves the state of the device's NvLink for the link specified
+ *
+ * For Pascal &tm; or newer fully supported devices.
+ *
+ * @param device The identifier of the target device
+ * @param link Specifies the NvLink link to be queried
+ * @param isActive \a nvmlEnableState_t where NVML_FEATURE_ENABLED indicates that
+ * the link is active and NVML_FEATURE_DISABLED indicates it
+ * is inactive
+ *
+ * @return
+ * - \ref NVML_SUCCESS if \a isActive has been set
+ * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized
+ * - \ref NVML_ERROR_INVALID_ARGUMENT if \a device or \a link is invalid or \a isActive is NULL
+ * - \ref NVML_ERROR_NOT_SUPPORTED if the device doesn't support this feature
+ * - \ref NVML_ERROR_UNKNOWN on any unexpected error
+ */
+nvmlReturn_t DECLDIR nvmlDeviceGetNvLinkState(nvmlDevice_t device, unsigned int link, nvmlEnableState_t *isActive);
+
+/**
+ * Retrieves the version of the device's NvLink for the link specified
+ *
+ * For Pascal &tm; or newer fully supported devices.
+ *
+ * @param device The identifier of the target device
+ * @param link Specifies the NvLink link to be queried
+ * @param version Requested NvLink version
+ *
+ * @return
+ * - \ref NVML_SUCCESS if \a version has been set
+ * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized
+ * - \ref NVML_ERROR_INVALID_ARGUMENT if \a device or \a link is invalid or \a version is NULL
+ * - \ref NVML_ERROR_NOT_SUPPORTED if the device doesn't support this feature
+ * - \ref NVML_ERROR_UNKNOWN on any unexpected error
+ */
+nvmlReturn_t DECLDIR nvmlDeviceGetNvLinkVersion(nvmlDevice_t device, unsigned int link, unsigned int *version);
+
+/**
+ * Retrieves the requested capability from the device's NvLink for the link specified
+ * Please refer to the \a nvmlNvLinkCapability_t structure for the specific caps that can be queried
+ * The return value should be treated as a boolean.
+ *
+ * For Pascal &tm; or newer fully supported devices.
+ *
+ * @param device The identifier of the target device
+ * @param link Specifies the NvLink link to be queried
+ * @param capability Specifies the \a nvmlNvLinkCapability_t to be queried
+ * @param capResult A boolean for the queried capability indicating that feature is available
+ *
+ * @return
+ * - \ref NVML_SUCCESS if \a capResult has been set
+ * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized
+ * - \ref NVML_ERROR_INVALID_ARGUMENT if \a device, \a link, or \a capability is invalid or \a capResult is NULL
+ * - \ref NVML_ERROR_NOT_SUPPORTED if the device doesn't support this feature
+ * - \ref NVML_ERROR_UNKNOWN on any unexpected error
+ */
+nvmlReturn_t DECLDIR nvmlDeviceGetNvLinkCapability(nvmlDevice_t device, unsigned int link,
+ nvmlNvLinkCapability_t capability, unsigned int *capResult);
+
+/**
+ * Retrieves the PCI information for the remote node on a NvLink link
+ * Note: pciSubSystemId is not filled in this function and is indeterminate
+ *
+ * For Pascal &tm; or newer fully supported devices.
+ *
+ * @param device The identifier of the target device
+ * @param link Specifies the NvLink link to be queried
+ * @param pci \a nvmlPciInfo_t of the remote node for the specified link
+ *
+ * @return
+ * - \ref NVML_SUCCESS if \a pci has been set
+ * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized
+ * - \ref NVML_ERROR_INVALID_ARGUMENT if \a device or \a link is invalid or \a pci is NULL
+ * - \ref NVML_ERROR_NOT_SUPPORTED if the device doesn't support this feature
+ * - \ref NVML_ERROR_UNKNOWN on any unexpected error
+ */
+nvmlReturn_t DECLDIR nvmlDeviceGetNvLinkRemotePciInfo(nvmlDevice_t device, unsigned int link, nvmlPciInfo_t *pci);
+
+/**
+ * Retrieves the specified error counter value
+ * Please refer to \a nvmlNvLinkErrorCounter_t for error counters that are available
+ *
+ * For Pascal &tm; or newer fully supported devices.
+ *
+ * @param device The identifier of the target device
+ * @param link Specifies the NvLink link to be queried
+ * @param counter Specifies the NvLink counter to be queried
+ * @param counterValue Returned counter value
+ *
+ * @return
+ * - \ref NVML_SUCCESS if \a counterValue has been set
+ * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized
+ * - \ref NVML_ERROR_INVALID_ARGUMENT if \a device, \a link, or \a counter is invalid or \a counterValue is NULL
+ * - \ref NVML_ERROR_NOT_SUPPORTED if the device doesn't support this feature
+ * - \ref NVML_ERROR_UNKNOWN on any unexpected error
+ */
+nvmlReturn_t DECLDIR nvmlDeviceGetNvLinkErrorCounter(nvmlDevice_t device, unsigned int link,
+ nvmlNvLinkErrorCounter_t counter, unsigned long long *counterValue);
+
+/**
+ * Resets all error counters to zero
+ * Please refer to \a nvmlNvLinkErrorCounter_t for the list of error counters that are reset
+ *
+ * For Pascal &tm; or newer fully supported devices.
+ *
+ * @param device The identifier of the target device
+ * @param link Specifies the NvLink link whose error counters are to be reset
+ *
+ * @return
+ * - \ref NVML_SUCCESS if the reset is successful
+ * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized
+ * - \ref NVML_ERROR_INVALID_ARGUMENT if \a device or \a link is invalid
+ * - \ref NVML_ERROR_NOT_SUPPORTED if the device doesn't support this feature
+ * - \ref NVML_ERROR_UNKNOWN on any unexpected error
+ */
+nvmlReturn_t DECLDIR nvmlDeviceResetNvLinkErrorCounters(nvmlDevice_t device, unsigned int link);
+
+/**
+ * Set the NVLINK utilization counter control information for the specified counter, 0 or 1.
+ * Please refer to \a nvmlNvLinkUtilizationControl_t for the structure definition. Performs a reset
+ * of the counters if the reset parameter is non-zero.
+ *
+ * For Pascal &tm; or newer fully supported devices.
+ *
+ * @param device The identifier of the target device
+ * @param link Specifies the NvLink link whose utilization counter control is to be set
+ * @param counter Specifies the counter that should be set (0 or 1).
+ * @param control A reference to the \a nvmlNvLinkUtilizationControl_t to set
+ * @param reset Resets the counters on set if non-zero
+ *
+ * @return
+ * - \ref NVML_SUCCESS if the control has been set successfully
+ * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized
+ * - \ref NVML_ERROR_INVALID_ARGUMENT if \a device, \a counter, \a link, or \a control is invalid
+ * - \ref NVML_ERROR_NOT_SUPPORTED if the device doesn't support this feature
+ * - \ref NVML_ERROR_UNKNOWN on any unexpected error
+ */
+nvmlReturn_t DECLDIR nvmlDeviceSetNvLinkUtilizationControl(nvmlDevice_t device, unsigned int link, unsigned int counter,
+ nvmlNvLinkUtilizationControl_t *control, unsigned int reset);
+
+/**
+ * Get the NVLINK utilization counter control information for the specified counter, 0 or 1.
+ * Please refer to \a nvmlNvLinkUtilizationControl_t for the structure definition
+ *
+ * For Pascal &tm; or newer fully supported devices.
+ *
+ * @param device The identifier of the target device
+ * @param link Specifies the NvLink link to be queried
+ * @param counter Specifies the counter that should be queried (0 or 1).
+ * @param control A reference to the \a nvmlNvLinkUtilizationControl_t to place information
+ *
+ * @return
+ * - \ref NVML_SUCCESS if \a control has been populated successfully
+ * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized
+ * - \ref NVML_ERROR_INVALID_ARGUMENT if \a device, \a counter, \a link, or \a control is invalid
+ * - \ref NVML_ERROR_NOT_SUPPORTED if the device doesn't support this feature
+ * - \ref NVML_ERROR_UNKNOWN on any unexpected error
+ */
+nvmlReturn_t DECLDIR nvmlDeviceGetNvLinkUtilizationControl(nvmlDevice_t device, unsigned int link, unsigned int counter,
+ nvmlNvLinkUtilizationControl_t *control);
+
+
+/**
+ * Retrieve the NVLINK utilization counter based on the current control for a specified counter.
+ * In general it is good practice to use \a nvmlDeviceSetNvLinkUtilizationControl
+ * before reading the utilization counters as they have no default state
+ *
+ * For Pascal &tm; or newer fully supported devices.
+ *
+ * @param device The identifier of the target device
+ * @param link Specifies the NvLink link to be queried
+ * @param counter Specifies the counter that should be read (0 or 1).
+ * @param rxcounter Receive counter return value
+ * @param txcounter Transmit counter return value
+ *
+ * @return
+ * - \ref NVML_SUCCESS if \a rxcounter and \a txcounter have been successfully set
+ * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized
+ * - \ref NVML_ERROR_INVALID_ARGUMENT if \a device, \a counter, or \a link is invalid or \a rxcounter or \a txcounter are NULL
+ * - \ref NVML_ERROR_NOT_SUPPORTED if the device doesn't support this feature
+ * - \ref NVML_ERROR_UNKNOWN on any unexpected error
+ */
+nvmlReturn_t DECLDIR nvmlDeviceGetNvLinkUtilizationCounter(nvmlDevice_t device, unsigned int link, unsigned int counter,
+ unsigned long long *rxcounter, unsigned long long *txcounter);
+
+/**
+ * Freeze the NVLINK utilization counters
+ * Both the receive and transmit counters are operated on by this function
+ *
+ * For Pascal &tm; or newer fully supported devices.
+ *
+ * @param device The identifier of the target device
+ * @param link Specifies the NvLink link whose counters are to be frozen or unfrozen
+ * @param counter Specifies the counter that should be frozen (0 or 1).
+ * @param freeze NVML_FEATURE_ENABLED = freeze the receive and transmit counters
+ * NVML_FEATURE_DISABLED = unfreeze the receive and transmit counters
+ *
+ * @return
+ * - \ref NVML_SUCCESS if counters were successfully frozen or unfrozen
+ * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized
+ * - \ref NVML_ERROR_INVALID_ARGUMENT if \a device, \a link, \a counter, or \a freeze is invalid
+ * - \ref NVML_ERROR_NOT_SUPPORTED if the device doesn't support this feature
+ * - \ref NVML_ERROR_UNKNOWN on any unexpected error
+ */
+nvmlReturn_t DECLDIR nvmlDeviceFreezeNvLinkUtilizationCounter (nvmlDevice_t device, unsigned int link,
+ unsigned int counter, nvmlEnableState_t freeze);
+
+/**
+ * Reset the NVLINK utilization counters
+ * Both the receive and transmit counters are operated on by this function
+ *
+ * For Pascal &tm; or newer fully supported devices.
+ *
+ * @param device The identifier of the target device
+ * @param link Specifies the NvLink link to be reset
+ * @param counter Specifies the counter that should be reset (0 or 1)
+ *
+ * @return
+ * - \ref NVML_SUCCESS if counters were successfully reset
+ * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized
+ * - \ref NVML_ERROR_INVALID_ARGUMENT if \a device, \a link, or \a counter is invalid
+ * - \ref NVML_ERROR_NOT_SUPPORTED if the device doesn't support this feature
+ * - \ref NVML_ERROR_UNKNOWN on any unexpected error
+ */
+nvmlReturn_t DECLDIR nvmlDeviceResetNvLinkUtilizationCounter (nvmlDevice_t device, unsigned int link, unsigned int counter);
+
+/** @} */
+
+/***************************************************************************************************/
+/** @defgroup nvmlEvents Event Handling Methods
+ * This chapter describes methods that NVML can perform against each device to register and wait for
+ * some event to occur.
+ * @{
+ */
+/***************************************************************************************************/
+
+/**
+ * Create an empty set of events.
+ * Event set should be freed by \ref nvmlEventSetFree
+ *
+ * For Fermi &tm; or newer fully supported devices.
+ * @param set Reference in which to return the event handle
+ *
+ * @return
+ * - \ref NVML_SUCCESS if the event set has been created
+ * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized
+ * - \ref NVML_ERROR_INVALID_ARGUMENT if \a set is NULL
+ * - \ref NVML_ERROR_UNKNOWN on any unexpected error
+ *
+ * @see nvmlEventSetFree
+ */
+nvmlReturn_t DECLDIR nvmlEventSetCreate(nvmlEventSet_t *set);
+
+/**
+ * Starts recording of events on a specified device and adds the events to the specified \ref nvmlEventSet_t
+ *
+ * For Fermi &tm; or newer fully supported devices.
+ * Ecc events are available only on ECC enabled devices (see \ref nvmlDeviceGetTotalEccErrors)
+ * Power capping events are available only on Power Management enabled devices (see \ref nvmlDeviceGetPowerManagementMode)
+ *
+ * For Linux only.
+ *
+ * \b IMPORTANT: Operations on \a set are not thread safe
+ *
+ * This call starts recording of events on specific device.
+ * All events that occurred before this call are not recorded.
+ * Checking if some event occurred can be done with \ref nvmlEventSetWait
+ *
+ * If function reports NVML_ERROR_UNKNOWN, event set is in undefined state and should be freed.
+ * If function reports NVML_ERROR_NOT_SUPPORTED, event set can still be used. None of the requested eventTypes
+ * are registered in that case.
+ *
+ * @param device The identifier of the target device
+ * @param eventTypes Bitmask of \ref nvmlEventType to record
+ * @param set Set to which the new event types are added
+ *
+ * @return
+ * - \ref NVML_SUCCESS if the requested event types have been registered
+ * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized
+ * - \ref NVML_ERROR_INVALID_ARGUMENT if \a eventTypes is invalid or \a set is NULL
+ * - \ref NVML_ERROR_NOT_SUPPORTED if the platform does not support this feature or some of requested event types
+ * - \ref NVML_ERROR_GPU_IS_LOST if the target GPU has fallen off the bus or is otherwise inaccessible
+ * - \ref NVML_ERROR_UNKNOWN on any unexpected error
+ *
+ * @see nvmlEventType
+ * @see nvmlDeviceGetSupportedEventTypes
+ * @see nvmlEventSetWait
+ * @see nvmlEventSetFree
+ */
+nvmlReturn_t DECLDIR nvmlDeviceRegisterEvents(nvmlDevice_t device, unsigned long long eventTypes, nvmlEventSet_t set);
+
+/**
+ * Returns information about events supported on device
+ *
+ * For Fermi &tm; or newer fully supported devices.
+ *
+ * Events are not supported on Windows. So this function returns an empty mask in \a eventTypes on Windows.
+ *
+ * @param device The identifier of the target device
+ * @param eventTypes Reference in which to return bitmask of supported events
+ *
+ * @return
+ * - \ref NVML_SUCCESS if the eventTypes has been set
+ * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized
+ * - \ref NVML_ERROR_INVALID_ARGUMENT if \a eventTypes is NULL
+ * - \ref NVML_ERROR_GPU_IS_LOST if the target GPU has fallen off the bus or is otherwise inaccessible
+ * - \ref NVML_ERROR_UNKNOWN on any unexpected error
+ *
+ * @see nvmlEventType
+ * @see nvmlDeviceRegisterEvents
+ */
+nvmlReturn_t DECLDIR nvmlDeviceGetSupportedEventTypes(nvmlDevice_t device, unsigned long long *eventTypes);
+
+/**
+ * Waits on events and delivers events
+ *
+ * For Fermi &tm; or newer fully supported devices.
+ *
+ * If some events are ready to be delivered at the time of the call, function returns immediately.
+ * If there are no events ready to be delivered, function sleeps till event arrives
+ * but not longer than specified timeout. This function in certain conditions can return before
+ * specified timeout passes (e.g. when interrupt arrives)
+ *
+ * In case of xid error, the function returns the most recent xid error type seen by the system. If there are multiple
+ * xid errors generated before nvmlEventSetWait is invoked then the last seen xid error type is returned for all
+ * xid error events.
+ *
+ * @param set Reference to set of events to wait on
+ * @param data Reference in which to return event data
+ * @param timeoutms Maximum amount of wait time in milliseconds for registered event
+ *
+ * @return
+ * - \ref NVML_SUCCESS if the data has been set
+ * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized
+ * - \ref NVML_ERROR_INVALID_ARGUMENT if \a data is NULL
+ * - \ref NVML_ERROR_TIMEOUT if no event arrived in specified timeout or interrupt arrived
+ * - \ref NVML_ERROR_GPU_IS_LOST if a GPU has fallen off the bus or is otherwise inaccessible
+ * - \ref NVML_ERROR_UNKNOWN on any unexpected error
+ *
+ * @see nvmlEventType
+ * @see nvmlDeviceRegisterEvents
+ */
+nvmlReturn_t DECLDIR nvmlEventSetWait(nvmlEventSet_t set, nvmlEventData_t * data, unsigned int timeoutms);
+
+/**
+ * Releases events in the set
+ *
+ * For Fermi &tm; or newer fully supported devices.
+ *
+ * @param set Reference to events to be released
+ *
+ * @return
+ * - \ref NVML_SUCCESS if the event has been successfully released
+ * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized
+ * - \ref NVML_ERROR_UNKNOWN on any unexpected error
+ *
+ * @see nvmlDeviceRegisterEvents
+ */
+nvmlReturn_t DECLDIR nvmlEventSetFree(nvmlEventSet_t set);
+
+/** @} */
+
+/***************************************************************************************************/
+/** @defgroup nvmlZPI Drain states
+ * This chapter describes methods that NVML can perform against each device to control their drain state
+ * and recognition by NVML and NVIDIA kernel driver. These methods can be used with out-of-band tools to
+ * power on/off GPUs, enable robust reset scenarios, etc.
+ * @{
+ */
+/***************************************************************************************************/
+
+/**
+ * Modify the drain state of a GPU. This method forces a GPU to no longer accept new incoming requests.
+ * Any new NVML process will no longer see this GPU. Persistence mode for this GPU must be turned off before
+ * this call is made.
+ * Must be called as administrator.
+ * For Linux only.
+ *
+ * For Pascal &tm; or newer fully supported devices.
+ * Some Kepler devices supported.
+ *
+ * @param pciInfo The PCI address of the GPU drain state to be modified
+ * @param newState The drain state that should be entered, see \ref nvmlEnableState_t
+ *
+ * @return
+ * - \ref NVML_SUCCESS if the drain state was successfully modified
+ * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized
+ * - \ref NVML_ERROR_INVALID_ARGUMENT if \a pciInfo or \a newState is invalid
+ * - \ref NVML_ERROR_NOT_SUPPORTED if the device doesn't support this feature
+ * - \ref NVML_ERROR_NO_PERMISSION if the calling process has insufficient permissions to perform operation
+ * - \ref NVML_ERROR_IN_USE if the device has persistence mode turned on
+ * - \ref NVML_ERROR_UNKNOWN on any unexpected error
+ */
+nvmlReturn_t DECLDIR nvmlDeviceModifyDrainState (nvmlPciInfo_t *pciInfo, nvmlEnableState_t newState);
+
+/**
+ * Query the drain state of a GPU. This method is used to check if a GPU is in a currently draining
+ * state.
+ * For Linux only.
+ *
+ * For Pascal &tm; or newer fully supported devices.
+ * Some Kepler devices supported.
+ *
+ * @param pciInfo The PCI address of the GPU drain state to be queried
+ * @param currentState The current drain state for this GPU, see \ref nvmlEnableState_t
+ *
+ * @return
+ * - \ref NVML_SUCCESS if \a currentState has been set
+ * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized
+ * - \ref NVML_ERROR_INVALID_ARGUMENT if \a pciInfo or \a currentState is invalid
+ * - \ref NVML_ERROR_NOT_SUPPORTED if the device doesn't support this feature
+ * - \ref NVML_ERROR_UNKNOWN on any unexpected error
+ */
+nvmlReturn_t DECLDIR nvmlDeviceQueryDrainState (nvmlPciInfo_t *pciInfo, nvmlEnableState_t *currentState);
+
+/**
+ * This method will remove the specified GPU from the view of both NVML and the NVIDIA kernel driver
+ * as long as no other processes are attached. If other processes are attached, this call will return
+ * NVML_ERROR_IN_USE and the GPU will be returned to its original "draining" state. Note: the
+ * only situation where a process can still be attached after nvmlDeviceModifyDrainState() is called
+ * to initiate the draining state is if that process was using, and is still using, a GPU before the
+ * call was made. Also note, persistence mode counts as an attachment to the GPU thus it must be disabled
+ * prior to this call.
+ *
+ * For long-running NVML processes please note that this will change the enumeration of current GPUs.
+ * For example, if there are four GPUs present and GPU1 is removed, the new enumeration will be 0-2.
+ * Also, device handles after the removed GPU will not be valid and must be re-established.
+ * Must be run as administrator.
+ * For Linux only.
+ *
+ * For Pascal &tm; or newer fully supported devices.
+ * Some Kepler devices supported.
+ *
+ * @param pciInfo The PCI address of the GPU to be removed
+ * @param gpuState Whether the GPU is to be removed from the OS;
+ * see \ref nvmlDetachGpuState_t
+ * @param linkState Requested upstream PCIe link state, see \ref nvmlPcieLinkState_t
+ *
+ * @return
+ * - \ref NVML_SUCCESS if the GPU was successfully removed
+ * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized
+ * - \ref NVML_ERROR_INVALID_ARGUMENT if \a pciInfo is invalid
+ * - \ref NVML_ERROR_NOT_SUPPORTED if the device doesn't support this feature
+ * - \ref NVML_ERROR_IN_USE if the device is still in use and cannot be removed
+ */
+nvmlReturn_t DECLDIR nvmlDeviceRemoveGpu (nvmlPciInfo_t *pciInfo, nvmlDetachGpuState_t gpuState, nvmlPcieLinkState_t linkState);
+
+/**
+ * Request the OS and the NVIDIA kernel driver to rediscover a portion of the PCI subsystem looking for GPUs that
+ * were previously removed. The portion of the PCI tree can be narrowed by specifying a domain, bus, and device.
+ * If all are zeroes then the entire PCI tree will be searched. Please note that for long-running NVML processes
+ * the enumeration will change based on how many GPUs are discovered and where they are inserted in bus order.
+ *
+ * In addition, all newly discovered GPUs will be initialized and their ECC scrubbed which may take several seconds
+ * per GPU. Also, all device handles are no longer guaranteed to be valid post discovery.
+ *
+ * Must be run as administrator.
+ * For Linux only.
+ *
+ * For Pascal &tm; or newer fully supported devices.
+ * Some Kepler devices supported.
+ *
+ * @param pciInfo The PCI tree to be searched. Only the domain, bus, and device
+ * fields are used in this call.
+ *
+ * @return
+ * - \ref NVML_SUCCESS if the rediscovery request completed successfully
+ * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized
+ * - \ref NVML_ERROR_INVALID_ARGUMENT if \a pciInfo is invalid
+ * - \ref NVML_ERROR_NOT_SUPPORTED if the operating system does not support this feature
+ * - \ref NVML_ERROR_OPERATING_SYSTEM if the operating system is denying this feature
+ * - \ref NVML_ERROR_NO_PERMISSION if the calling process has insufficient permissions to perform operation
+ * - \ref NVML_ERROR_UNKNOWN on any unexpected error
+ */
+nvmlReturn_t DECLDIR nvmlDeviceDiscoverGpus (nvmlPciInfo_t *pciInfo);
+
+/** @} */
+
+/***************************************************************************************************/
+/** @defgroup nvmlFieldValueQueries Field Value Queries
+ * This chapter describes NVML operations that are associated with retrieving Field Values from NVML
+ * @{
+ */
+/***************************************************************************************************/
+
+/**
+ * Request values for a list of fields for a device. This API allows multiple fields to be queried at once.
+ * If any of the underlying fieldIds are populated by the same driver call, the results for those field IDs
+ * will be populated from a single call rather than making a driver call for each fieldId.
+ *
+ * @param device The device handle of the GPU to request field values for
+ * @param valuesCount Number of entries in values that should be retrieved
+ * @param values Array of \a valuesCount structures to hold field values.
+ * Each value's fieldId must be populated prior to this call
+ *
+ * @return
+ * - \ref NVML_SUCCESS if any values in \a values were populated. Note that you must
+ * check the nvmlReturn field of each value for each individual
+ * status
+ * - \ref NVML_ERROR_INVALID_ARGUMENT if \a device is invalid or \a values is NULL
+ */
+nvmlReturn_t DECLDIR nvmlDeviceGetFieldValues(nvmlDevice_t device, int valuesCount, nvmlFieldValue_t *values);
+
+
+/** @} */
+
+/***************************************************************************************************/
+/** @defgroup nvmlGridQueries Grid Queries
+ * This chapter describes NVML operations that are associated with NVIDIA GRID products.
+ * @{
+ */
+/***************************************************************************************************/
+
+/**
+ * This method is used to get the virtualization mode corresponding to the GPU.
+ *
+ * For Kepler &tm; or newer fully supported devices.
+ *
+ * @param device Identifier of the target device
+ * @param pVirtualMode Reference to virtualization mode. One of NVML_GPU_VIRTUALIZATION_?
+ *
+ * @return
+ * - \ref NVML_SUCCESS if \a pVirtualMode is fetched
+ * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized
+ * - \ref NVML_ERROR_INVALID_ARGUMENT if \a device is invalid or \a pVirtualMode is NULL
+ * - \ref NVML_ERROR_GPU_IS_LOST if the target GPU has fallen off the bus or is otherwise inaccessible
+ * - \ref NVML_ERROR_UNKNOWN on any unexpected error
+ */
+nvmlReturn_t DECLDIR nvmlDeviceGetVirtualizationMode(nvmlDevice_t device, nvmlGpuVirtualizationMode_t *pVirtualMode);
+
+/** @} */
+
+/***************************************************************************************************/
+/** @defgroup nvmlGridCommands Grid Commands
+ * This chapter describes NVML operations that are associated with NVIDIA GRID products.
+ * @{
+ */
+/***************************************************************************************************/
+
+/**
+ * This method is used to set the virtualization mode corresponding to the GPU.
+ *
+ * For Kepler &tm; or newer fully supported devices.
+ *
+ * @param device Identifier of the target device
+ * @param virtualMode virtualization mode. One of NVML_GPU_VIRTUALIZATION_?
+ *
+ * @return
+ * - \ref NVML_SUCCESS if \a virtualMode is set
+ * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized
+ * - \ref NVML_ERROR_INVALID_ARGUMENT if \a device is invalid or \a virtualMode is invalid
+ * - \ref NVML_ERROR_GPU_IS_LOST if the target GPU has fallen off the bus or is otherwise inaccessible
+ * - \ref NVML_ERROR_NOT_SUPPORTED if setting of virtualization mode is not supported.
+ * - \ref NVML_ERROR_NO_PERMISSION if setting of virtualization mode is not allowed for this client.
+ */
+nvmlReturn_t DECLDIR nvmlDeviceSetVirtualizationMode(nvmlDevice_t device, nvmlGpuVirtualizationMode_t virtualMode);
+
+/** @} */
+
+/***************************************************************************************************/
+/** @defgroup nvmlVgpu vGPU Management
+ * @{
+ *
+ * Set of APIs supporting GRID vGPU
+ */
+/***************************************************************************************************/
+
+/**
+ * Retrieve the supported vGPU types on a physical GPU (device).
+ *
+ * An array of supported vGPU types for the physical GPU indicated by \a device is returned in the caller-supplied buffer
+ * pointed at by \a vgpuTypeIds. The element count of nvmlVgpuTypeId_t array is passed in \a vgpuCount, and \a vgpuCount
+ * is used to return the number of vGPU types written to the buffer.
+ *
+ * If the supplied buffer is not large enough to accommodate the vGPU type array, the function returns
+ * NVML_ERROR_INSUFFICIENT_SIZE, with the element count of nvmlVgpuTypeId_t array required in \a vgpuCount.
+ * To query the number of vGPU types supported for the GPU, call this function with *vgpuCount = 0.
+ * The code will return NVML_ERROR_INSUFFICIENT_SIZE, or NVML_SUCCESS if no vGPU types are supported.
+ *
+ * @param device The identifier of the target device
+ * @param vgpuCount Pointer to caller-supplied array size, and returns number of vGPU types
+ * @param vgpuTypeIds Pointer to caller-supplied array in which to return list of vGPU types
+ *
+ * @return
+ * - \ref NVML_SUCCESS successful completion
+ * - \ref NVML_ERROR_INSUFFICIENT_SIZE \a vgpuTypeIds buffer is too small, array element count is returned in \a vgpuCount
+ * - \ref NVML_ERROR_INVALID_ARGUMENT if \a vgpuCount is NULL or \a device is invalid
+ * - \ref NVML_ERROR_NOT_SUPPORTED if vGPU is not supported by the device
+ * - \ref NVML_ERROR_VGPU_ECC_NOT_SUPPORTED if ECC is enabled on the device
+ * - \ref NVML_ERROR_UNKNOWN on any unexpected error
+ */
+nvmlReturn_t DECLDIR nvmlDeviceGetSupportedVgpus(nvmlDevice_t device, unsigned int *vgpuCount, nvmlVgpuTypeId_t *vgpuTypeIds);
+
+/**
+ * Retrieve the currently creatable vGPU types on a physical GPU (device).
+ *
+ * An array of creatable vGPU types for the physical GPU indicated by \a device is returned in the caller-supplied buffer
+ * pointed at by \a vgpuTypeIds. The element count of nvmlVgpuTypeId_t array is passed in \a vgpuCount, and \a vgpuCount
+ * is used to return the number of vGPU types written to the buffer.
+ *
+ * The creatable vGPU types for a device may differ over time, as there may be restrictions on what type of vGPU types
+ * can concurrently run on a device. For example, if only one vGPU type is allowed at a time on a device, then the creatable
+ * list will be restricted to whatever vGPU type is already running on the device.
+ *
+ * If the supplied buffer is not large enough to accommodate the vGPU type array, the function returns
+ * NVML_ERROR_INSUFFICIENT_SIZE, with the element count of nvmlVgpuTypeId_t array required in \a vgpuCount.
+ * To query the number of vGPU types creatable for the GPU, call this function with *vgpuCount = 0.
+ * The code will return NVML_ERROR_INSUFFICIENT_SIZE, or NVML_SUCCESS if no vGPU types are creatable.
+ *
+ * @param device The identifier of the target device
+ * @param vgpuCount Pointer to caller-supplied array size, and returns number of vGPU types
+ * @param vgpuTypeIds Pointer to caller-supplied array in which to return list of vGPU types
+ *
+ * @return
+ * - \ref NVML_SUCCESS successful completion
+ * - \ref NVML_ERROR_INSUFFICIENT_SIZE \a vgpuTypeIds buffer is too small, array element count is returned in \a vgpuCount
+ * - \ref NVML_ERROR_INVALID_ARGUMENT if \a vgpuCount is NULL
+ * - \ref NVML_ERROR_NOT_SUPPORTED if vGPU is not supported by the device
+ * - \ref NVML_ERROR_VGPU_ECC_NOT_SUPPORTED if ECC is enabled on the device
+ * - \ref NVML_ERROR_UNKNOWN on any unexpected error
+ */
+nvmlReturn_t DECLDIR nvmlDeviceGetCreatableVgpus(nvmlDevice_t device, unsigned int *vgpuCount, nvmlVgpuTypeId_t *vgpuTypeIds);
+
+/**
+ * Retrieve the class of a vGPU type. It will not exceed 64 characters in length (including the NUL terminator).
+ * See \ref nvmlConstants::NVML_DEVICE_NAME_BUFFER_SIZE.
+ *
+ * For Kepler &tm; or newer fully supported devices.
+ *
+ * @param vgpuTypeId Handle to vGPU type
+ * @param vgpuTypeClass Pointer to string array to return class in
+ * @param size Size of string
+ *
+ * @return
+ * - \ref NVML_SUCCESS successful completion
+ * - \ref NVML_ERROR_INVALID_ARGUMENT if \a vgpuTypeId is invalid, or \a vgpuTypeClass is NULL
+ * - \ref NVML_ERROR_INSUFFICIENT_SIZE if \a size is too small
+ * - \ref NVML_ERROR_UNKNOWN on any unexpected error
+ */
+nvmlReturn_t DECLDIR nvmlVgpuTypeGetClass(nvmlVgpuTypeId_t vgpuTypeId, char *vgpuTypeClass, unsigned int *size);
+
+/**
+ * Retrieve the vGPU type name.
+ *
+ * The name is an alphanumeric string that denotes a particular vGPU, e.g. GRID M60-2Q. It will not
+ * exceed 64 characters in length (including the NUL terminator). See \ref
+ * nvmlConstants::NVML_DEVICE_NAME_BUFFER_SIZE.
+ *
+ * For Kepler &tm; or newer fully supported devices.
+ *
+ * @param vgpuTypeId Handle to vGPU type
+ * @param vgpuTypeName Pointer to buffer to return name
+ * @param size Size of buffer
+ *
+ * @return
+ * - \ref NVML_SUCCESS successful completion
+ * - \ref NVML_ERROR_INVALID_ARGUMENT if \a vgpuTypeId is invalid, or \a vgpuTypeName is NULL
+ * - \ref NVML_ERROR_INSUFFICIENT_SIZE if \a size is too small
+ * - \ref NVML_ERROR_UNKNOWN on any unexpected error
+ */
+nvmlReturn_t DECLDIR nvmlVgpuTypeGetName(nvmlVgpuTypeId_t vgpuTypeId, char *vgpuTypeName, unsigned int *size);
+
+/**
+ * Retrieve the device ID of a vGPU type.
+ *
+ * For Kepler &tm; or newer fully supported devices.
+ *
+ * @param vgpuTypeId Handle to vGPU type
+ * @param deviceID Device ID and vendor ID of the device contained in single 32 bit value
+ * @param subsystemID Subsystem ID and subsystem vendor ID of the device contained in single 32 bit value
+ *
+ * @return
+ * - \ref NVML_SUCCESS successful completion
+ * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized
+ * - \ref NVML_ERROR_INVALID_ARGUMENT if \a vgpuTypeId is invalid, or \a deviceID or \a subsystemID are NULL
+ * - \ref NVML_ERROR_UNKNOWN on any unexpected error
+ */
+nvmlReturn_t DECLDIR nvmlVgpuTypeGetDeviceID(nvmlVgpuTypeId_t vgpuTypeId, unsigned long long *deviceID, unsigned long long *subsystemID);
+
+/**
+ * Retrieve the vGPU framebuffer size in bytes.
+ *
+ * For Kepler &tm; or newer fully supported devices.
+ *
+ * @param vgpuTypeId Handle to vGPU type
+ * @param fbSize Pointer to framebuffer size in bytes
+ *
+ * @return
+ * - \ref NVML_SUCCESS successful completion
+ * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized
+ * - \ref NVML_ERROR_INVALID_ARGUMENT if \a vgpuTypeId is invalid, or \a fbSize is NULL
+ * - \ref NVML_ERROR_UNKNOWN on any unexpected error
+ */
+nvmlReturn_t DECLDIR nvmlVgpuTypeGetFramebufferSize(nvmlVgpuTypeId_t vgpuTypeId, unsigned long long *fbSize);
+
+/**
+ * Retrieve count of vGPU's supported display heads.
+ *
+ * For Kepler &tm; or newer fully supported devices.
+ *
+ * @param vgpuTypeId Handle to vGPU type
+ * @param numDisplayHeads Pointer to number of display heads
+ *
+ * @return
+ * - \ref NVML_SUCCESS successful completion
+ * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized
+ * - \ref NVML_ERROR_INVALID_ARGUMENT if \a vgpuTypeId is invalid, or \a numDisplayHeads is NULL
+ * - \ref NVML_ERROR_UNKNOWN on any unexpected error
+ */
+nvmlReturn_t DECLDIR nvmlVgpuTypeGetNumDisplayHeads(nvmlVgpuTypeId_t vgpuTypeId, unsigned int *numDisplayHeads);
+
+/**
+ * Retrieve vGPU display head's maximum supported resolution.
+ *
+ * For Kepler &tm; or newer fully supported devices.
+ *
+ * @param vgpuTypeId Handle to vGPU type
+ * @param displayIndex Zero-based index of display head
+ * @param xdim Pointer to maximum number of pixels in X dimension
+ * @param ydim Pointer to maximum number of pixels in Y dimension
+ *
+ * @return
+ * - \ref NVML_SUCCESS successful completion
+ * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized
+ * - \ref NVML_ERROR_INVALID_ARGUMENT if \a vgpuTypeId is invalid, or \a xdim or \a ydim are NULL, or \a displayIndex
+ * is out of range.
+ * - \ref NVML_ERROR_UNKNOWN on any unexpected error
+ */
+nvmlReturn_t DECLDIR nvmlVgpuTypeGetResolution(nvmlVgpuTypeId_t vgpuTypeId, unsigned int displayIndex, unsigned int *xdim, unsigned int *ydim);
+
+/**
+ * Retrieve license requirements for a vGPU type
+ *
+ * The license type and version required to run the specified vGPU type is returned as an alphanumeric string, in the form
+ * "<license name>,<version>", for example "GRID-Virtual-PC,2.0". If a vGPU is runnable with more than one type of license,
+ * the licenses are delimited by a semicolon, for example "GRID-Virtual-PC,2.0;GRID-Virtual-WS,2.0;GRID-Virtual-WS-Ext,2.0".
+ *
+ * The total length of the returned string will not exceed 128 characters, including the NUL terminator.
+ * See \ref nvmlVgpuConstants::NVML_GRID_LICENSE_BUFFER_SIZE.
+ *
+ * For Kepler &tm; or newer fully supported devices.
+ *
+ * @param vgpuTypeId Handle to vGPU type
+ * @param vgpuTypeLicenseString Pointer to buffer to return license info
+ * @param size Size of \a vgpuTypeLicenseString buffer
+ *
+ * @return
+ * - \ref NVML_SUCCESS successful completion
+ * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized
+ * - \ref NVML_ERROR_INVALID_ARGUMENT if \a vgpuTypeId is invalid, or \a vgpuTypeLicenseString is NULL
+ * - \ref NVML_ERROR_INSUFFICIENT_SIZE if \a size is too small
+ * - \ref NVML_ERROR_UNKNOWN on any unexpected error
+ */
+nvmlReturn_t DECLDIR nvmlVgpuTypeGetLicense(nvmlVgpuTypeId_t vgpuTypeId, char *vgpuTypeLicenseString, unsigned int size);
+
+/**
+ * Retrieve the static frame rate limit value of the vGPU type
+ *
+ * For Kepler &tm; or newer fully supported devices.
+ *
+ * @param vgpuTypeId Handle to vGPU type
+ * @param frameRateLimit Reference to return the frame rate limit value
+ * @return
+ * - \ref NVML_SUCCESS successful completion
+ * - \ref NVML_ERROR_NOT_SUPPORTED if frame rate limiter is turned off for the vGPU type
+ * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized
+ * - \ref NVML_ERROR_INVALID_ARGUMENT if \a vgpuTypeId is invalid, or \a frameRateLimit is NULL
+ * - \ref NVML_ERROR_UNKNOWN on any unexpected error
+ */
+nvmlReturn_t DECLDIR nvmlVgpuTypeGetFrameRateLimit(nvmlVgpuTypeId_t vgpuTypeId, unsigned int *frameRateLimit);
+
+/**
+ * Retrieve the maximum number of vGPU instances creatable on a device for given vGPU type
+ *
+ * For Kepler &tm; or newer fully supported devices.
+ *
+ * @param device The identifier of the target device
+ * @param vgpuTypeId Handle to vGPU type
+ * @param vgpuInstanceCount Pointer to get the max number of vGPU instances
+ * that can be created on a device for given vgpuTypeId
+ * @return
+ * - \ref NVML_SUCCESS successful completion
+ * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized
+ * - \ref NVML_ERROR_INVALID_ARGUMENT if \a vgpuTypeId is invalid or is not supported on target device,
+ * or \a vgpuInstanceCount is NULL
+ * - \ref NVML_ERROR_UNKNOWN on any unexpected error
+ */
+nvmlReturn_t DECLDIR nvmlVgpuTypeGetMaxInstances(nvmlDevice_t device, nvmlVgpuTypeId_t vgpuTypeId, unsigned int *vgpuInstanceCount);
+
+/**
+ * Retrieve the active vGPU instances on a device.
+ *
+ * An array of active vGPU instances is returned in the caller-supplied buffer pointed at by \a vgpuInstances. The
+ * array element count is passed in \a vgpuCount, and \a vgpuCount is used to return the number of vGPU instances
+ * written to the buffer.
+ *
+ * If the supplied buffer is not large enough to accommodate the vGPU instance array, the function returns
+ * NVML_ERROR_INSUFFICIENT_SIZE, with the element count of nvmlVgpuInstance_t array required in \a vgpuCount.
+ * To query the number of active vGPU instances, call this function with *vgpuCount = 0. The code will return
+ * NVML_ERROR_INSUFFICIENT_SIZE, or NVML_SUCCESS if no vGPU instances are active.
+ *
+ * For Kepler &tm; or newer fully supported devices.
+ *
+ * @param device The identifier of the target device
+ * @param vgpuCount Pointer which passes in the array size and returns
+ * the number of vGPU instances
+ * @param vgpuInstances Pointer to array in which to return list of vGPU instances
+ *
+ * @return
+ * - \ref NVML_SUCCESS successful completion
+ * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized
+ * - \ref NVML_ERROR_INVALID_ARGUMENT if \a device is invalid, or \a vgpuCount is NULL
+ * - \ref NVML_ERROR_INSUFFICIENT_SIZE if \a vgpuCount is too small
+ * - \ref NVML_ERROR_NOT_SUPPORTED if vGPU is not supported by the device
+ * - \ref NVML_ERROR_UNKNOWN on any unexpected error
+ */
+nvmlReturn_t DECLDIR nvmlDeviceGetActiveVgpus(nvmlDevice_t device, unsigned int *vgpuCount, nvmlVgpuInstance_t *vgpuInstances);
+
+/**
+ * Retrieve the VM ID associated with a vGPU instance.
+ *
+ * The VM ID is returned as a string, not exceeding 80 characters in length (including the NUL terminator).
+ * See \ref nvmlConstants::NVML_DEVICE_UUID_BUFFER_SIZE.
+ *
+ * The format of the VM ID varies by platform, and is indicated by the type identifier returned in \a vmIdType.
+ *
+ * For Kepler &tm; or newer fully supported devices.
+ *
+ * @param vgpuInstance Identifier of the target vGPU instance
+ * @param vmId Pointer to caller-supplied buffer to hold VM ID
+ * @param size Size of buffer in bytes
+ * @param vmIdType Pointer to hold VM ID type
+ *
+ * @return
+ * - \ref NVML_SUCCESS successful completion
+ * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized
+ * - \ref NVML_ERROR_INVALID_ARGUMENT if \a vgpuInstance is invalid, or \a vmId or \a vmIdType are NULL
+ * - \ref NVML_ERROR_INSUFFICIENT_SIZE if \a size is too small
+ * - \ref NVML_ERROR_UNKNOWN on any unexpected error
+ */
+nvmlReturn_t DECLDIR nvmlVgpuInstanceGetVmID(nvmlVgpuInstance_t vgpuInstance, char *vmId, unsigned int size, nvmlVgpuVmIdType_t *vmIdType);
+
+/**
+ * Retrieve the UUID of a vGPU instance.
+ *
+ * The UUID is a globally unique identifier associated with the vGPU, and is returned as a 5-part hexadecimal string,
+ * not exceeding 80 characters in length (including the NUL terminator).
+ * See \ref nvmlConstants::NVML_DEVICE_UUID_BUFFER_SIZE.
+ *
+ * For Kepler &tm; or newer fully supported devices.
+ *
+ * @param vgpuInstance Identifier of the target vGPU instance
+ * @param uuid Pointer to caller-supplied buffer to hold vGPU UUID
+ * @param size Size of buffer in bytes
+ *
+ * @return
+ * - \ref NVML_SUCCESS successful completion
+ * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized
+ * - \ref NVML_ERROR_INVALID_ARGUMENT if \a vgpuInstance is invalid, or \a uuid is NULL
+ * - \ref NVML_ERROR_INSUFFICIENT_SIZE if \a size is too small
+ * - \ref NVML_ERROR_UNKNOWN on any unexpected error
+ */
+nvmlReturn_t DECLDIR nvmlVgpuInstanceGetUUID(nvmlVgpuInstance_t vgpuInstance, char *uuid, unsigned int size);
+
+/**
+ * Retrieve the NVIDIA driver version installed in the VM associated with a vGPU.
+ *
+ * The version is returned as an alphanumeric string in the caller-supplied buffer \a version. The length of the version
+ * string will not exceed 80 characters in length (including the NUL terminator).
+ * See \ref nvmlConstants::NVML_SYSTEM_DRIVER_VERSION_BUFFER_SIZE.
+ *
+ * nvmlVgpuInstanceGetVmDriverVersion() may be called at any time for a vGPU instance. The guest VM driver version is
+ * returned as "Unknown" if no NVIDIA driver is installed in the VM, or the VM has not yet booted to the point where the
+ * NVIDIA driver is loaded and initialized.
+ *
+ * For Kepler &tm; or newer fully supported devices.
+ *
+ * @param vgpuInstance Identifier of the target vGPU instance
+ * @param version Caller-supplied buffer to return driver version string
+ * @param length Size of \a version buffer
+ *
+ * @return
+ * - \ref NVML_SUCCESS if \a version has been set
+ * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized
+ * - \ref NVML_ERROR_INVALID_ARGUMENT if \a vgpuInstance is invalid
+ * - \ref NVML_ERROR_INSUFFICIENT_SIZE if \a length is too small
+ * - \ref NVML_ERROR_UNKNOWN on any unexpected error
+ */
+nvmlReturn_t DECLDIR nvmlVgpuInstanceGetVmDriverVersion(nvmlVgpuInstance_t vgpuInstance, char* version, unsigned int length);
+
+/**
+ * Retrieve the framebuffer usage in bytes.
+ *
+ * Framebuffer usage is the amount of vGPU framebuffer memory that is currently in use by the VM.
+ *
+ * For Kepler &tm; or newer fully supported devices.
+ *
+ * @param vgpuInstance The identifier of the target instance
+ * @param fbUsage Pointer to framebuffer usage in bytes
+ *
+ * @return
+ * - \ref NVML_SUCCESS successful completion
+ * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized
+ * - \ref NVML_ERROR_INVALID_ARGUMENT if \a vgpuInstance is invalid, or \a fbUsage is NULL
+ * - \ref NVML_ERROR_UNKNOWN on any unexpected error
+ */
+nvmlReturn_t DECLDIR nvmlVgpuInstanceGetFbUsage(nvmlVgpuInstance_t vgpuInstance, unsigned long long *fbUsage);
+
+/**
+ * Retrieve the current licensing state of the vGPU instance.
+ *
+ * If the vGPU is currently licensed, \a licensed is set to 1, otherwise it is set to 0.
+ *
+ * For Kepler &tm; or newer fully supported devices.
+ *
+ * @param vgpuInstance Identifier of the target vGPU instance
+ * @param licensed Reference to return the licensing status
+ *
+ * @return
+ * - \ref NVML_SUCCESS if \a licensed has been set
+ * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized
+ * - \ref NVML_ERROR_INVALID_ARGUMENT if \a vgpuInstance is invalid, or \a licensed is NULL
+ * - \ref NVML_ERROR_UNKNOWN on any unexpected error
+ */
+nvmlReturn_t DECLDIR nvmlVgpuInstanceGetLicenseStatus(nvmlVgpuInstance_t vgpuInstance, unsigned int *licensed);
+
+/**
+ * Retrieve the vGPU type of a vGPU instance.
+ *
+ * Returns the vGPU type ID of vgpu assigned to the vGPU instance.
+ *
+ * For Kepler &tm; or newer fully supported devices.
+ *
+ * @param vgpuInstance Identifier of the target vGPU instance
+ * @param vgpuTypeId Reference to return the vgpuTypeId
+ *
+ * @return
+ * - \ref NVML_SUCCESS if \a vgpuTypeId has been set
+ * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized
+ * - \ref NVML_ERROR_INVALID_ARGUMENT if \a vgpuInstance is invalid, or \a vgpuTypeId is NULL
+ * - \ref NVML_ERROR_UNKNOWN on any unexpected error
+ */
+nvmlReturn_t DECLDIR nvmlVgpuInstanceGetType(nvmlVgpuInstance_t vgpuInstance, nvmlVgpuTypeId_t *vgpuTypeId);
+
+/**
+ * Retrieve the frame rate limit set for the vGPU instance.
+ *
+ * Returns the value of the frame rate limit set for the vGPU instance
+ *
+ * For Kepler &tm; or newer fully supported devices.
+ *
+ * @param vgpuInstance Identifier of the target vGPU instance
+ * @param frameRateLimit Reference to return the frame rate limit
+ *
+ * @return
+ * - \ref NVML_SUCCESS if \a frameRateLimit has been set
+ * - \ref NVML_ERROR_NOT_SUPPORTED if frame rate limiter is turned off for the vGPU type
+ * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized
+ * - \ref NVML_ERROR_INVALID_ARGUMENT if \a vgpuInstance is invalid, or \a frameRateLimit is NULL
+ * - \ref NVML_ERROR_UNKNOWN on any unexpected error
+ */
+nvmlReturn_t DECLDIR nvmlVgpuInstanceGetFrameRateLimit(nvmlVgpuInstance_t vgpuInstance, unsigned int *frameRateLimit);
+
+/**
+ * Retrieve the encoder capacity of a vGPU instance, as a percentage of maximum encoder capacity with valid values in the range 0-100.
+ *
+ * For Maxwell &tm; or newer fully supported devices.
+ *
+ * @param vgpuInstance Identifier of the target vGPU instance
+ * @param encoderCapacity Reference to an unsigned int for the encoder capacity
+ *
+ * @return
+ * - \ref NVML_SUCCESS if \a encoderCapacity has been retrieved
+ * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized
+ * - \ref NVML_ERROR_INVALID_ARGUMENT if \a vgpuInstance is invalid, or \a encoderCapacity is NULL
+ * - \ref NVML_ERROR_UNKNOWN on any unexpected error
+ */
+nvmlReturn_t DECLDIR nvmlVgpuInstanceGetEncoderCapacity(nvmlVgpuInstance_t vgpuInstance, unsigned int *encoderCapacity);
+
+/**
+ * Set the encoder capacity of a vGPU instance, as a percentage of maximum encoder capacity with valid values in the range 0-100.
+ *
+ * For Maxwell &tm; or newer fully supported devices.
+ *
+ * @param vgpuInstance Identifier of the target vGPU instance
+ * @param encoderCapacity Unsigned int for the encoder capacity value
+ *
+ * @return
+ * - \ref NVML_SUCCESS if \a encoderCapacity has been set
+ * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized
+ * - \ref NVML_ERROR_INVALID_ARGUMENT if \a vgpuInstance is invalid
+ * - \ref NVML_ERROR_UNKNOWN on any unexpected error
+ */
+nvmlReturn_t DECLDIR nvmlVgpuInstanceSetEncoderCapacity(nvmlVgpuInstance_t vgpuInstance, unsigned int encoderCapacity);
+
+/**
+ * Retrieves current utilization for vGPUs on a physical GPU (device).
+ *
+ * For Kepler &tm; or newer fully supported devices.
+ *
+ * Reads recent utilization of GPU SM (3D/Compute), framebuffer, video encoder, and video decoder for vGPU instances running
+ * on a device. Utilization values are returned as an array of utilization sample structures in the caller-supplied buffer
+ * pointed at by \a utilizationSamples. One utilization sample structure is returned per vGPU instance, and includes the
+ * CPU timestamp at which the samples were recorded. Individual utilization values are returned as "unsigned int" values
+ * in nvmlValue_t unions. The function sets the caller-supplied \a sampleValType to NVML_VALUE_TYPE_UNSIGNED_INT to
+ * indicate the returned value type.
+ *
+ * To read utilization values, first determine the size of buffer required to hold the samples by invoking the function with
+ * \a utilizationSamples set to NULL. The function will return NVML_ERROR_INSUFFICIENT_SIZE, with the current vGPU instance
+ * count in \a vgpuInstanceSamplesCount, or NVML_SUCCESS if the current vGPU instance count is zero. The caller should allocate
+ * a buffer of size vgpuInstanceSamplesCount * sizeof(nvmlVgpuInstanceUtilizationSample_t). Invoke the function again with
+ * the allocated buffer passed in \a utilizationSamples, and \a vgpuInstanceSamplesCount set to the number of entries the
+ * buffer is sized for.
+ *
+ * On successful return, the function updates \a vgpuInstanceSamplesCount with the number of vGPU utilization sample
+ * structures that were actually written. This may differ from a previously read value as vGPU instances are created or
+ * destroyed.
+ *
+ * lastSeenTimeStamp represents the CPU timestamp in microseconds at which utilization samples were last read. Set it to 0
+ * to read utilization based on all the samples maintained by the driver's internal sample buffer. Set lastSeenTimeStamp
+ * to a timeStamp retrieved from a previous query to read utilization since the previous query.
+ *
+ * @param device The identifier for the target device
+ * @param lastSeenTimeStamp Return only samples with timestamp greater than lastSeenTimeStamp.
+ * @param sampleValType Pointer to caller-supplied buffer to hold the type of returned sample values
+ * @param vgpuInstanceSamplesCount Pointer to caller-supplied array size, and returns number of vGPU instances
+ * @param utilizationSamples Pointer to caller-supplied buffer in which vGPU utilization samples are returned
+ *
+ * @return
+ * - \ref NVML_SUCCESS if utilization samples are successfully retrieved
+ * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized
+ * - \ref NVML_ERROR_INVALID_ARGUMENT if \a device is invalid, \a vgpuInstanceSamplesCount or \a sampleValType is
+ * NULL, or a sample count of 0 is passed with a non-NULL \a utilizationSamples
+ * - \ref NVML_ERROR_INSUFFICIENT_SIZE if supplied \a vgpuInstanceSamplesCount is too small to return samples for all
+ * vGPU instances currently executing on the device
+ * - \ref NVML_ERROR_NOT_SUPPORTED if vGPU is not supported by the device
+ * - \ref NVML_ERROR_GPU_IS_LOST if the target GPU has fallen off the bus or is otherwise inaccessible
+ * - \ref NVML_ERROR_NOT_FOUND if sample entries are not found
+ * - \ref NVML_ERROR_UNKNOWN on any unexpected error
+ */
+nvmlReturn_t DECLDIR nvmlDeviceGetVgpuUtilization(nvmlDevice_t device, unsigned long long lastSeenTimeStamp,
+ nvmlValueType_t *sampleValType, unsigned int *vgpuInstanceSamplesCount,
+ nvmlVgpuInstanceUtilizationSample_t *utilizationSamples);
+
+/**
+ * Retrieves current utilization for processes running on vGPUs on a physical GPU (device).
+ *
+ * For Maxwell &tm; or newer fully supported devices.
+ *
+ * Reads recent utilization of GPU SM (3D/Compute), framebuffer, video encoder, and video decoder for processes running on
+ * vGPU instances active on a device. Utilization values are returned as an array of utilization sample structures in the
+ * caller-supplied buffer pointed at by \a utilizationSamples. One utilization sample structure is returned per process running
+ * on vGPU instances, that had some non-zero utilization during the last sample period. It includes the CPU timestamp at which
+ * the samples were recorded. Individual utilization values are returned as "unsigned int" values.
+ *
+ * To read utilization values, first determine the size of buffer required to hold the samples by invoking the function with
+ * \a utilizationSamples set to NULL. The function will return NVML_ERROR_INSUFFICIENT_SIZE, with the current vGPU instance
+ * count in \a vgpuProcessSamplesCount. The caller should allocate a buffer of size
+ * vgpuProcessSamplesCount * sizeof(nvmlVgpuProcessUtilizationSample_t). Invoke the function again with
+ * the allocated buffer passed in \a utilizationSamples, and \a vgpuProcessSamplesCount set to the number of entries the
+ * buffer is sized for.
+ *
+ * On successful return, the function updates \a vgpuProcessSamplesCount with the number of vGPU sub process utilization sample
+ * structures that were actually written. This may differ from a previously read value depending on the number of processes that are active
+ * in any given sample period.
+ *
+ * lastSeenTimeStamp represents the CPU timestamp in microseconds at which utilization samples were last read. Set it to 0
+ * to read utilization based on all the samples maintained by the driver's internal sample buffer. Set lastSeenTimeStamp
+ * to a timeStamp retrieved from a previous query to read utilization since the previous query.
+ *
+ * @param device The identifier for the target device
+ * @param lastSeenTimeStamp Return only samples with timestamp greater than lastSeenTimeStamp.
+ * @param vgpuProcessSamplesCount Pointer to caller-supplied array size, and returns number of processes running on vGPU instances
+ * @param utilizationSamples Pointer to caller-supplied buffer in which vGPU sub process utilization samples are returned
+ *
+ * @return
+ * - \ref NVML_SUCCESS if utilization samples are successfully retrieved
+ * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized
+ * - \ref NVML_ERROR_INVALID_ARGUMENT if \a device is invalid, \a vgpuProcessSamplesCount is NULL, or a sample count of 0 is
+ * passed with a non-NULL \a utilizationSamples
+ * - \ref NVML_ERROR_INSUFFICIENT_SIZE if supplied \a vgpuProcessSamplesCount is too small to return samples for all
+ * vGPU instances currently executing on the device
+ * - \ref NVML_ERROR_NOT_SUPPORTED if vGPU is not supported by the device
+ * - \ref NVML_ERROR_GPU_IS_LOST if the target GPU has fallen off the bus or is otherwise inaccessible
+ * - \ref NVML_ERROR_NOT_FOUND if sample entries are not found
+ * - \ref NVML_ERROR_UNKNOWN on any unexpected error
+ */
+nvmlReturn_t DECLDIR nvmlDeviceGetVgpuProcessUtilization(nvmlDevice_t device, unsigned long long lastSeenTimeStamp,
+ unsigned int *vgpuProcessSamplesCount,
+ nvmlVgpuProcessUtilizationSample_t *utilizationSamples);
+/**
+ * Retrieve the GRID licensable features.
+ *
+ * Identifies whether the system supports GRID Software Licensing. If it does, return the list of licensable feature(s)
+ * and their current license status.
+ *
+ * @param device Identifier of the target device
+ * @param pGridLicensableFeatures Pointer to structure in which GRID licensable features are returned
+ *
+ * @return
+ * - \ref NVML_SUCCESS if licensable features are successfully retrieved
+ * - \ref NVML_ERROR_INVALID_ARGUMENT if \a pGridLicensableFeatures is NULL
+ * - \ref NVML_ERROR_UNKNOWN on any unexpected error
+ */
+nvmlReturn_t DECLDIR nvmlDeviceGetGridLicensableFeatures(nvmlDevice_t device, nvmlGridLicensableFeatures_t *pGridLicensableFeatures);
+
+/**
+ * Retrieves the current encoder statistics of a vGPU Instance
+ *
+ * For Maxwell &tm; or newer fully supported devices.
+ *
+ * @param vgpuInstance Identifier of the target vGPU instance
+ * @param sessionCount Reference to an unsigned int for count of active encoder sessions
+ * @param averageFps Reference to an unsigned int for trailing average FPS of all active sessions
+ * @param averageLatency Reference to an unsigned int for encode latency in microseconds
+ *
+ * @return
+ * - \ref NVML_SUCCESS if \a sessionCount, \a averageFps and \a averageLatency are fetched
+ * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized
+ * - \ref NVML_ERROR_INVALID_ARGUMENT if \a sessionCount, or \a averageFps or \a averageLatency is NULL
+ * or \a vgpuInstance is invalid.
+ * - \ref NVML_ERROR_UNKNOWN on any unexpected error
+ */
+nvmlReturn_t DECLDIR nvmlVgpuInstanceGetEncoderStats(nvmlVgpuInstance_t vgpuInstance, unsigned int *sessionCount,
+ unsigned int *averageFps, unsigned int *averageLatency);
+
+/**
+ * Retrieves information about all active encoder sessions on a vGPU Instance.
+ *
+ * An array of active encoder sessions is returned in the caller-supplied buffer pointed at by \a sessionInfo. The
+ * array element count is passed in \a sessionCount, and \a sessionCount is used to return the number of sessions
+ * written to the buffer.
+ *
+ * If the supplied buffer is not large enough to accommodate the active session array, the function returns
+ * NVML_ERROR_INSUFFICIENT_SIZE, with the element count of nvmlEncoderSessionInfo_t array required in \a sessionCount.
+ * To query the number of active encoder sessions, call this function with *sessionCount = 0. The code will return
+ * NVML_SUCCESS with number of active encoder sessions updated in *sessionCount.
+ *
+ * For Maxwell &tm; or newer fully supported devices.
+ *
+ * @param vgpuInstance Identifier of the target vGPU instance
+ * @param sessionCount Reference to caller supplied array size, and returns
+ * the number of sessions.
+ * @param sessionInfo Reference to caller supplied array in which the list
+ * of session information is returned.
+ *
+ * @return
+ * - \ref NVML_SUCCESS if \a sessionInfo is fetched
+ * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized
+ * - \ref NVML_ERROR_INSUFFICIENT_SIZE if \a sessionCount is too small, array element count is
+ * returned in \a sessionCount
+ * - \ref NVML_ERROR_INVALID_ARGUMENT if \a sessionCount is NULL or \a vgpuInstance is invalid.
+ * - \ref NVML_ERROR_UNKNOWN on any unexpected error
+ */
+nvmlReturn_t DECLDIR nvmlVgpuInstanceGetEncoderSessions(nvmlVgpuInstance_t vgpuInstance, unsigned int *sessionCount, nvmlEncoderSessionInfo_t *sessionInfo);
+
+/**
+ * Retrieves the current utilization and process ID
+ *
+ * For Maxwell &tm; or newer fully supported devices.
+ *
+ * Reads recent utilization of GPU SM (3D/Compute), framebuffer, video encoder, and video decoder for processes running.
+ * Utilization values are returned as an array of utilization sample structures in the caller-supplied buffer pointed at
+ * by \a utilization. One utilization sample structure is returned per process running, that had some non-zero utilization
+ * during the last sample period. It includes the CPU timestamp at which the samples were recorded. Individual utilization values
+ * are returned as "unsigned int" values.
+ *
+ * To read utilization values, first determine the size of buffer required to hold the samples by invoking the function with
+ * \a utilization set to NULL. The caller should allocate a buffer of size
+ * processSamplesCount * sizeof(nvmlProcessUtilizationSample_t). Invoke the function again with the allocated buffer passed
+ * in \a utilization, and \a processSamplesCount set to the number of entries the buffer is sized for.
+ *
+ * On successful return, the function updates \a processSamplesCount with the number of process utilization sample
+ * structures that were actually written. This may differ from a previously read value as instances are created or
+ * destroyed.
+ *
+ * lastSeenTimeStamp represents the CPU timestamp in microseconds at which utilization samples were last read. Set it to 0
+ * to read utilization based on all the samples maintained by the driver's internal sample buffer. Set lastSeenTimeStamp
+ * to a timeStamp retrieved from a previous query to read utilization since the previous query.
+ *
+ * @param device The identifier of the target device
+ * @param utilization Pointer to caller-supplied buffer in which guest process utilization samples are returned
+ * @param processSamplesCount Pointer to caller-supplied array size, and returns number of processes running
+ * @param lastSeenTimeStamp Return only samples with timestamp greater than lastSeenTimeStamp.
+ *
+ * @return
+ * - \ref NVML_SUCCESS if \a utilization has been populated
+ * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized
+ * - \ref NVML_ERROR_INVALID_ARGUMENT if \a device is invalid, \a utilization is NULL, or \a processSamplesCount is NULL
+ * - \ref NVML_ERROR_NOT_SUPPORTED if the device does not support this feature
+ * - \ref NVML_ERROR_GPU_IS_LOST if the target GPU has fallen off the bus or is otherwise inaccessible
+ * - \ref NVML_ERROR_UNKNOWN on any unexpected error
+ */
+nvmlReturn_t DECLDIR nvmlDeviceGetProcessUtilization(nvmlDevice_t device, nvmlProcessUtilizationSample_t *utilization,
+ unsigned int *processSamplesCount, unsigned long long lastSeenTimeStamp);
+
+/** @} */
+
+/***************************************************************************************************/
+/** @defgroup nvml vGPU Migration
+ * This chapter describes NVML operations that are associated with vGPU Migration.
+ * @{
+ */
+/***************************************************************************************************/
+
+/**
+ * vGPU metadata structure, as written by nvmlVgpuInstanceGetMetadata().
+ */
+typedef struct nvmlVgpuMetadata_st
+{
+ unsigned int version; //!< Current version of the structure
+ unsigned int revision; //!< Current revision of the structure
+ nvmlVgpuGuestInfoState_t guestInfoState; //!< Current state of Guest-dependent fields
+ char guestDriverVersion[NVML_SYSTEM_DRIVER_VERSION_BUFFER_SIZE]; //!< Version of driver installed in guest
+ char hostDriverVersion[NVML_SYSTEM_DRIVER_VERSION_BUFFER_SIZE]; //!< Version of driver installed in host
+ unsigned int reserved[8]; //!< Reserved for internal use
+ unsigned int opaqueDataSize; //!< Size of opaque data field in bytes
+ char opaqueData[4]; //!< Opaque data (NOTE(review): presumably continues past these 4 bytes, up to opaqueDataSize - confirm)
+} nvmlVgpuMetadata_t;
+
+/**
+ * Physical GPU metadata structure, as written by nvmlDeviceGetVgpuMetadata().
+ */
+typedef struct nvmlVgpuPgpuMetadata_st
+{
+ unsigned int version; //!< Current version of the structure
+ unsigned int revision; //!< Current revision of the structure
+ char hostDriverVersion[NVML_SYSTEM_DRIVER_VERSION_BUFFER_SIZE]; //!< Host driver version
+ unsigned int pgpuVirtualizationCaps; //!< Physical GPU virtualization capabilities bitfield
+ unsigned int reserved[7]; //!< Reserved for internal use
+ unsigned int opaqueDataSize; //!< Size of opaque data field in bytes
+ char opaqueData[4]; //!< Opaque data (NOTE(review): presumably continues past these 4 bytes, up to opaqueDataSize - confirm)
+} nvmlVgpuPgpuMetadata_t;
+
+/**
+ * vGPU VM compatibility codes (every non-zero code is a distinct single bit)
+ */
+typedef enum nvmlVgpuVmCompatibility_enum
+{
+ NVML_VGPU_VM_COMPATIBILITY_NONE = 0x0, //!< vGPU is not runnable
+ NVML_VGPU_VM_COMPATIBILITY_COLD = 0x1, //!< vGPU is runnable from a cold / powered-off state (ACPI S5)
+ NVML_VGPU_VM_COMPATIBILITY_HIBERNATE = 0x2, //!< vGPU is runnable from a hibernated state (ACPI S4)
+ NVML_VGPU_VM_COMPATIBILITY_SLEEP = 0x4, //!< vGPU is runnable from a sleep state (ACPI S3)
+ NVML_VGPU_VM_COMPATIBILITY_LIVE = 0x8, //!< vGPU is runnable from a live / paused state (ACPI S0)
+} nvmlVgpuVmCompatibility_t;
+
+/**
+ * vGPU-pGPU compatibility limit codes (every non-zero code is a distinct single bit)
+ */
+typedef enum nvmlVgpuPgpuCompatibilityLimitCode_enum
+{
+ NVML_VGPU_COMPATIBILITY_LIMIT_NONE = 0x0, //!< Compatibility is not limited.
+ NVML_VGPU_COMPATIBILITY_LIMIT_HOST_DRIVER = 0x1, //!< Compatibility is limited by host driver version.
+ NVML_VGPU_COMPATIBILITY_LIMIT_GUEST_DRIVER = 0x2, //!< Compatibility is limited by guest driver version.
+ NVML_VGPU_COMPATIBILITY_LIMIT_GPU = 0x4, //!< Compatibility is limited by GPU hardware.
+ NVML_VGPU_COMPATIBILITY_LIMIT_OTHER = 0x80000000, //!< Compatibility is limited by an undefined factor (highest bit of a 32-bit value).
+} nvmlVgpuPgpuCompatibilityLimitCode_t;
+
+/**
+ * vGPU-pGPU compatibility structure, as written by nvmlGetVgpuCompatibility().
+ */
+typedef struct nvmlVgpuPgpuCompatibility_st
+{
+ nvmlVgpuVmCompatibility_t vgpuVmCompatibility; //!< Compatibility of vGPU VM. See \ref nvmlVgpuVmCompatibility_t
+ nvmlVgpuPgpuCompatibilityLimitCode_t compatibilityLimitCode; //!< Limiting factor for vGPU-pGPU compatibility. See \ref nvmlVgpuPgpuCompatibilityLimitCode_t
+} nvmlVgpuPgpuCompatibility_t;
+
+/**
+ * Returns vGPU metadata structure for a running vGPU. The structure contains information about the vGPU and its associated VM
+ * such as the currently installed NVIDIA guest driver version, together with host driver version and an opaque data section
+ * containing internal state.
+ *
+ * nvmlVgpuInstanceGetMetadata() may be called at any time for a vGPU instance. Some fields in the returned structure are
+ * dependent on information obtained from the guest VM, which may not yet have reached a state where that information
+ * is available. The current state of these dependent fields is reflected in the info structure's \ref guestInfoState field.
+ *
+ * The VMM may choose to read and save the vGPU's VM info as persistent metadata associated with the VM, and provide
+ * it to GRID Virtual GPU Manager when creating a vGPU for subsequent instances of the VM.
+ *
+ * The caller passes in a buffer via \a vgpuMetadata, with the size of the buffer in \a bufferSize. If the vGPU Metadata structure
+ * is too large to fit in the supplied buffer, the function returns NVML_ERROR_INSUFFICIENT_SIZE with the size needed
+ * in \a bufferSize.
+ *
+ * @param vgpuInstance vGPU instance handle
+ * @param vgpuMetadata Pointer to caller-supplied buffer into which vGPU metadata is written
+ * @param bufferSize Pointer to size of \a vgpuMetadata buffer
+ *
+ * @return
+ * - \ref NVML_SUCCESS vGPU metadata structure was successfully returned
+ * - \ref NVML_ERROR_INSUFFICIENT_SIZE vgpuMetadata buffer is too small, required size is returned in \a bufferSize
+ * - \ref NVML_ERROR_INVALID_ARGUMENT if \a bufferSize is NULL or \a vgpuInstance is invalid; if \a vgpuMetadata is NULL and the value of \a bufferSize is not 0.
+ * - \ref NVML_ERROR_UNKNOWN on any unexpected error
+ */
+nvmlReturn_t DECLDIR nvmlVgpuInstanceGetMetadata(nvmlVgpuInstance_t vgpuInstance, nvmlVgpuMetadata_t *vgpuMetadata, unsigned int *bufferSize);
+
+/**
+ * Returns a vGPU metadata structure for the physical GPU indicated by \a device. The structure contains information about
+ * the GPU and the currently installed NVIDIA host driver version that's controlling it, together with an opaque data section
+ * containing internal state.
+ *
+ * The caller passes in a buffer via \a pgpuMetadata, with the size of the buffer in \a bufferSize. If the \a pgpuMetadata
+ * structure is too large to fit in the supplied buffer, the function returns NVML_ERROR_INSUFFICIENT_SIZE with the size needed
+ * in \a bufferSize.
+ *
+ * @param device The identifier of the target device
+ * @param pgpuMetadata Pointer to caller-supplied buffer into which \a pgpuMetadata is written
+ * @param bufferSize Pointer to size of \a pgpuMetadata buffer
+ *
+ * @return
+ * - \ref NVML_SUCCESS GPU metadata structure was successfully returned
+ * - \ref NVML_ERROR_INSUFFICIENT_SIZE pgpuMetadata buffer is too small, required size is returned in \a bufferSize
+ * - \ref NVML_ERROR_INVALID_ARGUMENT if \a bufferSize is NULL or \a device is invalid; if \a pgpuMetadata is NULL and the value of \a bufferSize is not 0.
+ * - \ref NVML_ERROR_NOT_SUPPORTED vGPU is not supported by the system
+ * - \ref NVML_ERROR_UNKNOWN on any unexpected error
+ */
+nvmlReturn_t DECLDIR nvmlDeviceGetVgpuMetadata(nvmlDevice_t device, nvmlVgpuPgpuMetadata_t *pgpuMetadata, unsigned int *bufferSize);
+
+/**
+ * Takes a vGPU instance metadata structure read from \ref nvmlVgpuInstanceGetMetadata(), and a vGPU metadata structure for a
+ * physical GPU read from \ref nvmlDeviceGetVgpuMetadata(), and returns compatibility information of the vGPU instance and the
+ * physical GPU.
+ *
+ * The caller passes in a buffer via \a compatibilityInfo, into which a compatibility information structure is written. The
+ * structure defines the states in which the vGPU / VM may be booted on the physical GPU. If the vGPU / VM compatibility
+ * with the physical GPU is limited, a limit code indicates the factor limiting compatibility.
+ * (see \ref nvmlVgpuPgpuCompatibilityLimitCode_t for details).
+ *
+ * Note: vGPU compatibility does not take into account dynamic capacity conditions that may limit a system's ability to
+ * boot a given vGPU or associated VM.
+ *
+ * @param vgpuMetadata Pointer to caller-supplied vGPU metadata structure
+ * @param pgpuMetadata Pointer to caller-supplied GPU metadata structure
+ * @param compatibilityInfo Pointer to caller-supplied buffer to hold compatibility info
+ *
+ * @return
+ * - \ref NVML_SUCCESS vGPU compatibility information was successfully returned
+ * - \ref NVML_ERROR_INVALID_ARGUMENT if \a vgpuMetadata or \a pgpuMetadata or \a compatibilityInfo are NULL
+ * - \ref NVML_ERROR_UNKNOWN on any unexpected error
+ */
+nvmlReturn_t DECLDIR nvmlGetVgpuCompatibility(nvmlVgpuMetadata_t *vgpuMetadata, nvmlVgpuPgpuMetadata_t *pgpuMetadata, nvmlVgpuPgpuCompatibility_t *compatibilityInfo);
+
+/** @} */
+
+/**
+ * NVML API versioning support
+ */
+#if defined(__NVML_API_VERSION_INTERNAL)
+#undef nvmlDeviceRemoveGpu
+#undef nvmlDeviceGetNvLinkRemotePciInfo
+#undef nvmlDeviceGetPciInfo
+#undef nvmlDeviceGetCount
+#undef nvmlDeviceGetHandleByIndex
+#undef nvmlDeviceGetHandleByPciBusId
+#undef nvmlInit
+#endif
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif
diff --git a/vendor/github.com/NVIDIA/gpu-monitoring-tools/bindings/go/nvml/nvml_dl.c b/vendor/github.com/NVIDIA/gpu-monitoring-tools/bindings/go/nvml/nvml_dl.c
new file mode 100644
index 000000000..a3d162c0e
--- /dev/null
+++ b/vendor/github.com/NVIDIA/gpu-monitoring-tools/bindings/go/nvml/nvml_dl.c
@@ -0,0 +1,46 @@
+// Copyright (c) 2015-2018, NVIDIA CORPORATION. All rights reserved.
+
+#include <stddef.h>
+#include <dlfcn.h>
+
+#include "nvml_dl.h"
+
+#define DLSYM(x, sym) /* Look up `sym` by name in `handle` and store it in `x`; on failure RETURNS from the enclosing function. */ \
+do { \
+ dlerror(); /* discard any earlier error so the check below only reflects this dlsym() */ \
+ x = dlsym(handle, #sym); /* #sym stringizes the identifier into the symbol name */ \
+ if (dlerror() != NULL) { /* failure is detected through dlerror(), not through the value stored in x */ \
+ return (NVML_ERROR_FUNCTION_NOT_FOUND); \
+ } \
+} while (0)
+
+typedef nvmlReturn_t (*nvmlSym_t)(); // Pointer to a looked-up NVML function; the empty parameter list leaves the arguments unchecked.
+
+static void *handle; // Library handle: assigned by nvmlInit_dl (dlopen), read by DLSYM, passed to dlclose by nvmlShutdown_dl.
+
+nvmlReturn_t NVML_DL(nvmlInit)(void) // Opens libnvidia-ml.so.1 and then calls nvmlInit(); returns NVML_ERROR_LIBRARY_NOT_FOUND if the open fails.
+{
+ handle = dlopen("libnvidia-ml.so.1", RTLD_LAZY | RTLD_GLOBAL); // NOTE(review): nvmlInit() below presumably binds to this library via RTLD_GLOBAL - confirm against the link flags
+ if (handle == NULL) {
+ return (NVML_ERROR_LIBRARY_NOT_FOUND); // nvmlInit() is not called when the library could not be opened
+ }
+ return (nvmlInit()); // the result of nvmlInit() is returned unchanged; handle stays open even if it reports an error
+}
+
+nvmlReturn_t NVML_DL(nvmlShutdown)(void) // Calls nvmlShutdown() and, only if that succeeds, closes the library handle.
+{
+ nvmlReturn_t r = nvmlShutdown(); // NOTE(review): handle is not checked for NULL first - callers presumably must have run nvmlInit_dl successfully
+ if (r != NVML_SUCCESS) {
+ return (r); // shutdown error is propagated and the library is left open
+ }
+ return (dlclose(handle) ? NVML_ERROR_UNKNOWN : NVML_SUCCESS); // non-zero dlclose() result maps to NVML_ERROR_UNKNOWN; handle is not reset afterwards
+}
+
+nvmlReturn_t NVML_DL(nvmlDeviceGetTopologyCommonAncestor)( // Calls nvmlDeviceGetTopologyCommonAncestor through a run-time symbol lookup.
+ nvmlDevice_t dev1, nvmlDevice_t dev2, nvmlGpuTopologyLevel_t *info)
+{
+ nvmlSym_t sym;
+
+ DLSYM(sym, nvmlDeviceGetTopologyCommonAncestor); // hidden early return: yields NVML_ERROR_FUNCTION_NOT_FOUND when the lookup fails
+ return ((*sym)(dev1, dev2, info)); // arguments are forwarded as-is and the callee's result is returned
+}
diff --git a/vendor/github.com/NVIDIA/gpu-monitoring-tools/bindings/go/nvml/nvml_dl.h b/vendor/github.com/NVIDIA/gpu-monitoring-tools/bindings/go/nvml/nvml_dl.h
new file mode 100644
index 000000000..628f0b3a2
--- /dev/null
+++ b/vendor/github.com/NVIDIA/gpu-monitoring-tools/bindings/go/nvml/nvml_dl.h
@@ -0,0 +1,15 @@
+// Copyright (c) 2015-2018, NVIDIA CORPORATION. All rights reserved.
+
+#ifndef _NVML_DL_H_
+#define _NVML_DL_H_
+
+#include "nvml.h"
+
+#define NVML_DL(x) x##_dl // Pastes the _dl suffix onto x, e.g. NVML_DL(nvmlShutdown) names nvmlShutdown_dl.
+
+extern nvmlReturn_t NVML_DL(nvmlInit)(void); // dlopen()s the NVML library, then calls nvmlInit()
+extern nvmlReturn_t NVML_DL(nvmlShutdown)(void); // calls nvmlShutdown(), then dlclose()s the library
+extern nvmlReturn_t NVML_DL(nvmlDeviceGetTopologyCommonAncestor)( // same arguments as the NVML function; resolved with dlsym() at call time
+ nvmlDevice_t, nvmlDevice_t, nvmlGpuTopologyLevel_t *);
+
+#endif // _NVML_DL_H_
diff --git a/vendor/vendor.json b/vendor/vendor.json
index 941f55cfa..779cb773f 100644
--- a/vendor/vendor.json
+++ b/vendor/vendor.json
@@ -9,6 +9,8 @@
{"path":"github.com/Azure/go-ansiterm/winterm","checksumSHA1":"jBimnggjIiFUjaImNoJhSVLtdzw=","revision":"fa152c58bc15761d0200cb75fe958b89a9d4888e","revisionTime":"2016-06-22T17:32:16Z"},
{"path":"github.com/DataDog/datadog-go/statsd","checksumSHA1":"WvApwvvSe3i/3KO8300dyeFmkbI=","revision":"b10af4b12965a1ad08d164f57d14195b4140d8de","revisionTime":"2017-08-09T10:47:06Z"},
{"path":"github.com/Microsoft/go-winio","checksumSHA1":"AzjRkOQtVBTwIw4RJLTygFhJs3s=","revision":"f533f7a102197536779ea3a8cb881d639e21ec5a","revisionTime":"2017-05-24T00:36:31Z"},
+ {"path":"github.com/NVIDIA/gpu-monitoring-tools","checksumSHA1":"kF1vk+8Xvb3nGBiw9+qbUc0SZ4M=","revision":"86f2a9fac6c5b597dc494420005144b8ef7ec9fb","revisionTime":"2018-08-29T22:20:09Z"},
+ {"path":"github.com/NVIDIA/gpu-monitoring-tools/bindings/go/nvml","checksumSHA1":"P8FATSSgpe5A17FyPrGpsX95Xw8=","revision":"86f2a9fac6c5b597dc494420005144b8ef7ec9fb","revisionTime":"2018-08-29T22:20:09Z"},
{"path":"github.com/NYTimes/gziphandler","checksumSHA1":"jktW57+vJsziNVPeXMCoujTzdW4=","revision":"97ae7fbaf81620fe97840685304a78a306a39c64","revisionTime":"2017-09-16T00:36:49Z"},
{"path":"github.com/Nvveen/Gotty","checksumSHA1":"Aqy8/FoAIidY/DeQ5oTYSZ4YFVc=","revision":"cd527374f1e5bff4938207604a14f2e38a9cf512","revisionTime":"2012-06-04T00:48:16Z"},
{"path":"github.com/RackSec/srslog","checksumSHA1":"OTN4c1F0p+mEG2CpkU1Kuavupf0=","revision":"259aed10dfa74ea2961eddd1d9847619f6e98837","revisionTime":"2016-01-20T22:33:50Z"},