diff --git a/helper/funcs.go b/helper/funcs.go index 49b300c24..4d8f53b1a 100644 --- a/helper/funcs.go +++ b/helper/funcs.go @@ -57,11 +57,16 @@ func Int64ToPtr(i int64) *int64 { return &i } -// UintToPtr returns the pointer to an uint +// Uint64ToPtr returns the pointer to an uint64 func Uint64ToPtr(u uint64) *uint64 { return &u } +// UintToPtr returns the pointer to an uint +func UintToPtr(u uint) *uint { + return &u +} + // StringToPtr returns the pointer to a string func StringToPtr(str string) *string { return &str diff --git a/plugins/device/cmd/nvidia/README.md b/plugins/device/cmd/nvidia/README.md new file mode 100644 index 000000000..55d1e7f13 --- /dev/null +++ b/plugins/device/cmd/nvidia/README.md @@ -0,0 +1,23 @@ +This package provides an implementation of the Nvidia device plugin. + +# Behavior + +The Nvidia device plugin uses NVML bindings to get data about the available Nvidia devices and exposes them via the Fingerprint RPC. GPUs can be excluded from fingerprinting by setting the `ignored_gpu_ids` field. The plugin emits statistics for fingerprinted devices every `stats_period`. + +# Config + +The configuration should be passed via an HCL file that begins with a top-level `config` stanza: + +``` +config { + ignored_gpu_ids = ["uuid1", "uuid2"] + fingerprint_period = "5s" + stats_period = "5s" +} +``` + +The valid configuration options are: + +* `ignored_gpu_ids` (`list(string)`: `[]`): List of GPU UUID strings that should not be exposed to Nomad. +* `fingerprint_period` (`string`: `"5s"`): The interval at which to repeat the fingerprint process to identify possible changes. +* `stats_period` (`string`: `"5s"`): The interval at which to emit statistics about the devices. 
diff --git a/plugins/device/cmd/nvidia/cmd/main.go b/plugins/device/cmd/nvidia/cmd/main.go
new file mode 100644
index 000000000..1b5b0c41c
--- /dev/null
+++ b/plugins/device/cmd/nvidia/cmd/main.go
@@ -0,0 +1,18 @@
+package main
+
+import (
+	log "github.com/hashicorp/go-hclog"
+
+	"github.com/hashicorp/nomad/plugins"
+	"github.com/hashicorp/nomad/plugins/device/cmd/nvidia"
+)
+
+func main() {
+	// Serve the plugin; factory is handed over to build the plugin instance
+	plugins.Serve(factory)
+}
+
+// factory returns a new instance of the Nvidia GPU plugin that logs to log
+func factory(log log.Logger) interface{} {
+	return nvidia.NewNvidiaDevice(log)
+}
diff --git a/plugins/device/cmd/nvidia/device.go b/plugins/device/cmd/nvidia/device.go
new file mode 100644
index 000000000..2613d8e77
--- /dev/null
+++ b/plugins/device/cmd/nvidia/device.go
@@ -0,0 +1,209 @@
+package nvidia
+
+import (
+	"context"
+	"fmt"
+	"strings"
+	"sync"
+	"time"
+
+	log "github.com/hashicorp/go-hclog"
+
+	"github.com/hashicorp/nomad/plugins/base"
+	"github.com/hashicorp/nomad/plugins/device"
+	"github.com/hashicorp/nomad/plugins/device/cmd/nvidia/nvml"
+	"github.com/hashicorp/nomad/plugins/shared/hclspec"
+)
+
+const (
+	// pluginName is the name of the plugin; it also names the plugin logger
+	pluginName = "nvidia-gpu"
+
+	// vendor is the vendor reported for every device group
+	vendor = "nvidia"
+
+	// deviceType is the device type reported for every device group
+	deviceType = device.DeviceTypeGPU
+
+	// notAvailable is the placeholder reported for a device name or attribute
+	// that the nvml driver could not detect
+	notAvailable = "N/A"
+)
+
+const (
+	// nvidiaVisibleDevices is the env variable Reserve sets to the reserved device IDs
+	nvidiaVisibleDevices = "NVIDIA_VISIBLE_DEVICES"
+)
+
+var (
+	// pluginInfo is the static description returned by PluginInfo
+	pluginInfo = &base.PluginInfoResponse{
+		Type:             base.PluginTypeDevice,
+		PluginApiVersion: "0.0.1", // XXX This should be an array and should be consts
+		PluginVersion:    "0.1.0",
+		Name:             pluginName,
+	}
+
+	// configSpec is the schema of the plugin's configuration, including defaults
+	configSpec = hclspec.NewObject(map[string]*hclspec.Spec{
+		"ignored_gpu_ids": hclspec.NewDefault(
+			hclspec.NewAttr("ignored_gpu_ids", "list(string)", false),
+			hclspec.NewLiteral("[]"),
+		),
+		"fingerprint_period": hclspec.NewDefault(
+			hclspec.NewAttr("fingerprint_period", "string", false),
+			hclspec.NewLiteral("\"5s\""),
+		),
+		"stats_period": hclspec.NewDefault(
+			hclspec.NewAttr("stats_period", "string", false),
+			hclspec.NewLiteral("\"5s\""),
+		),
+	})
+)
+
+// Config is the configuration decoded by SetConfig; periods are duration strings.
+type Config struct {
+	IgnoredGPUIDs     []string `codec:"ignored_gpu_ids"`
+	FingerprintPeriod string   `codec:"fingerprint_period"`
+	StatsPeriod       string   `codec:"stats_period"`
+}
+
+// NvidiaDevice contains all plugin specific data
+type NvidiaDevice struct {
+	// nvmlClient is used to get fingerprint and stats data from nvidia
+	nvmlClient nvml.NvmlClient
+
+	// nvmlClientInitializationError holds the error, if any, returned while
+	// creating nvmlClient; fingerprinting exits early when it is set
+	nvmlClientInitializationError error
+
+	// ignoredGPUIDs is the set of UUIDs filtered out of the fingerprint output
+	ignoredGPUIDs map[string]struct{}
+
+	// fingerprintPeriod is how often we should call nvml to get list of devices
+	fingerprintPeriod time.Duration
+
+	// statsPeriod is how often we should collect statistics for fingerprinted
+	// devices.
+	statsPeriod time.Duration
+
+	// devices is the set of UUIDs of detected eligible devices; guarded by deviceLock
+	devices    map[string]struct{}
+	deviceLock sync.RWMutex
+
+	logger log.Logger
+}
+
+// NewNvidiaDevice returns a new nvidia device plugin.
+func NewNvidiaDevice(log log.Logger) *NvidiaDevice {
+	nvmlClient, nvmlClientInitializationError := nvml.NewNvmlClient()
+	logger := log.Named(pluginName)
+	if nvmlClientInitializationError != nil {
+		// Not fatal here: the error is kept on the device so that the
+		// fingerprint goroutine can report that no GPUs are available.
+		logger.Error("unable to initialize Nvidia driver", "error", nvmlClientInitializationError)
+	}
+	return &NvidiaDevice{
+		logger:                        logger,
+		devices:                       make(map[string]struct{}),
+		ignoredGPUIDs:                 make(map[string]struct{}),
+		nvmlClient:                    nvmlClient,
+		nvmlClientInitializationError: nvmlClientInitializationError,
+	}
+}
+
+// PluginInfo returns information describing the plugin.
+func (d *NvidiaDevice) PluginInfo() (*base.PluginInfoResponse, error) {
+	return pluginInfo, nil
+}
+
+// ConfigSchema returns the plugin's configuration schema.
+func (d *NvidiaDevice) ConfigSchema() (*hclspec.Spec, error) {
+	return configSpec, nil
+}
+
+// SetConfig is used to set the configuration of the plugin.
+//
+// data is the MsgPack encoded Config. The configuration is applied only if
+// all of it is valid: when decoding fails or one of the periods is not a
+// valid duration string, an error is returned and d is left untouched.
+// Every successful call replaces the previous configuration, including the
+// set of ignored GPU IDs.
+func (d *NvidiaDevice) SetConfig(data []byte) error {
+	var config Config
+	if err := base.MsgPackDecode(data, &config); err != nil {
+		return err
+	}
+
+	// Parse both periods before mutating d so that a rejected configuration
+	// never leaves the plugin half-configured.
+	fingerprintPeriod, err := time.ParseDuration(config.FingerprintPeriod)
+	if err != nil {
+		return fmt.Errorf("failed to parse fingerprint period %q: %v", config.FingerprintPeriod, err)
+	}
+	statsPeriod, err := time.ParseDuration(config.StatsPeriod)
+	if err != nil {
+		return fmt.Errorf("failed to parse stats period %q: %v", config.StatsPeriod, err)
+	}
+
+	// Build a fresh set instead of adding to the existing one: IDs removed
+	// from the configuration must stop being ignored, and a receiver whose
+	// map was never allocated must not panic on assignment.
+	ignoredGPUIDs := make(map[string]struct{}, len(config.IgnoredGPUIDs))
+	for _, id := range config.IgnoredGPUIDs {
+		ignoredGPUIDs[id] = struct{}{}
+	}
+
+	d.ignoredGPUIDs = ignoredGPUIDs
+	d.fingerprintPeriod = fingerprintPeriod
+	d.statsPeriod = statsPeriod
+
+	return nil
+}
+
+// Fingerprint streams detected devices. If device changes are detected or the
+// devices health changes, messages will be emitted.
+func (d *NvidiaDevice) Fingerprint(ctx context.Context) (<-chan *device.FingerprintResponse, error) {
+	responses := make(chan *device.FingerprintResponse)
+	go d.fingerprint(ctx, responses)
+	return responses, nil
+}
+
+// reservationError is the error Reserve returns when at least one of the
+// requested device IDs is not among the currently fingerprinted devices.
+type reservationError struct {
+	notExistingIDs []string
+}
+
+// Error lists the unknown device IDs, separated by commas.
+func (e *reservationError) Error() string {
+	unknown := strings.Join(e.notExistingIDs, ",")
+	return fmt.Sprintf("unknown device IDs: %s", unknown)
+}
+
+// Reserve returns information on how to mount given devices.
+// The nomad server is assumed to be responsible for the correctness of GPU
+// allocations, including tricky cases such as double-allocation of one GPU.
+//
+// An empty request yields an empty reservation. Otherwise every requested ID
+// has to be a currently fingerprinted device, or a *reservationError naming
+// the unknown IDs is returned.
+func (d *NvidiaDevice) Reserve(deviceIDs []string) (*device.ContainerReservation, error) {
+	if len(deviceIDs) == 0 {
+		return &device.ContainerReservation{}, nil
+	}
+
+	// Fingerprinting runs asynchronously, so the server may ask for a GPU
+	// that a later fingerprint round has already reported as gone:
+	//
+	// 1 - fingerprint reports that GPU with id "1" is present
+	// 2 - at the same time
+	//      a) server decides to allocate GPU with id "1"
+	//      b) fingerprint check reports that GPU with id "1" is no more present
+	//
+	// d.devices always holds the latest fingerprinted ids, so the request is
+	// rejected when any of its IDs is missing from that map.
+	if unknown := d.unknownDeviceIDs(deviceIDs); len(unknown) != 0 {
+		return nil, &reservationError{notExistingIDs: unknown}
+	}
+
+	reservation := &device.ContainerReservation{
+		Envs: map[string]string{
+			nvidiaVisibleDevices: strings.Join(deviceIDs, ","),
+		},
+	}
+	return reservation, nil
+}
+
+// unknownDeviceIDs returns, in request order, the deviceIDs that are missing
+// from d.devices. The map is read under the read lock.
+func (d *NvidiaDevice) unknownDeviceIDs(deviceIDs []string) []string {
+	d.deviceLock.RLock()
+	defer d.deviceLock.RUnlock()
+
+	var unknown []string
+	for _, id := range deviceIDs {
+		if _, known := d.devices[id]; known {
+			continue
+		}
+		unknown = append(unknown, id)
+	}
+	return unknown
+}
+
+// Stats streams statistics for the detected devices.
+func (d *NvidiaDevice) Stats(ctx context.Context) (<-chan *device.StatsResponse, error) {
+	outCh := make(chan *device.StatsResponse)
+	go d.stats(ctx, outCh)
+	return outCh, nil
+}
diff --git a/plugins/device/cmd/nvidia/device_test.go b/plugins/device/cmd/nvidia/device_test.go
new file mode 100644
index 000000000..b1fa4b17a
--- /dev/null
+++ b/plugins/device/cmd/nvidia/device_test.go
@@ -0,0 +1,123 @@
+package nvidia
+
+import (
+	"testing"
+
+	"github.com/hashicorp/nomad/plugins/device/cmd/nvidia/nvml"
+
+	hclog "github.com/hashicorp/go-hclog"
+	"github.com/hashicorp/nomad/plugins/device"
+	"github.com/stretchr/testify/require"
+)
+
+// MockNvmlClient is a canned stand-in for the NVML client used by the plugin
+// in tests: each method returns the matching pair of fields verbatim.
+type MockNvmlClient struct {
+	FingerprintError            error
+	FingerprintResponseReturned *nvml.FingerprintData
+
+	StatsError            error
+	StatsResponseReturned []*nvml.StatsData
+}
+
+// GetFingerprintData returns the canned fingerprint response and error.
+func (c *MockNvmlClient) GetFingerprintData() (*nvml.FingerprintData, error) {
+	return c.FingerprintResponseReturned, c.FingerprintError
+}
+
+// GetStatsData returns the canned stats response and error.
+func (c *MockNvmlClient) GetStatsData() ([]*nvml.StatsData, error) {
+	return c.StatsResponseReturned, c.StatsError
+}
+
+// TestReserve checks Reserve for requests whose IDs are all, partly or not at
+// all known to the device, and for an empty request.
+func TestReserve(t *testing.T) {
+	for _, testCase := range []struct {
+		Name                string
+		ExpectedReservation *device.ContainerReservation
+		ExpectedError       error
+		Device              *NvidiaDevice
+		RequestedIDs        []string
+	}{
+		{
+			Name:                "All RequestedIDs are not managed by Device",
+			ExpectedReservation: nil,
+			ExpectedError: &reservationError{[]string{
+				"UUID1",
+				"UUID2",
+				"UUID3",
+			}},
+			RequestedIDs: []string{
+				"UUID1",
+				"UUID2",
+				"UUID3",
+			},
+			Device: &NvidiaDevice{
+				logger: hclog.NewNullLogger(),
+			},
+		},
+		{
+			Name:                "Some RequestedIDs are not managed by Device",
+			ExpectedReservation: nil,
+			ExpectedError: &reservationError{[]string{
+				"UUID1",
+				"UUID2",
+			}},
+			RequestedIDs: []string{
+				"UUID1",
+				"UUID2",
+				"UUID3",
+			},
+			Device: &NvidiaDevice{
+				devices: map[string]struct{}{
+					"UUID3": {},
+				},
+				logger: hclog.NewNullLogger(),
+			},
+		},
+		{
+			Name: "All RequestedIDs are managed by Device",
+			ExpectedReservation: &device.ContainerReservation{
+				Envs: map[string]string{
+					nvidiaVisibleDevices: "UUID1,UUID2,UUID3",
+				},
+			},
+			ExpectedError: nil,
+			RequestedIDs: []string{
+				"UUID1",
+				"UUID2",
+				"UUID3",
+			},
+			Device: &NvidiaDevice{
+				devices: map[string]struct{}{
+					"UUID1": {},
+					"UUID2": {},
+					"UUID3": {},
+				},
+				logger: hclog.NewNullLogger(),
+			},
+		},
+		{
+			Name:                "No IDs requested",
+			ExpectedReservation: &device.ContainerReservation{},
+			ExpectedError:       nil,
+			RequestedIDs:        nil,
+			Device: &NvidiaDevice{
+				devices: map[string]struct{}{
+					"UUID1": {},
+					"UUID2": {},
+					"UUID3": {},
+				},
+				logger: hclog.NewNullLogger(),
+			},
+		},
+	} {
+		t.Run(testCase.Name, func(t *testing.T) {
+			actualReservation, actualError := testCase.Device.Reserve(testCase.RequestedIDs)
+			req := require.New(t)
+			req.Equal(testCase.ExpectedReservation, actualReservation)
+			req.Equal(testCase.ExpectedError, actualError)
+		})
+	}
+}
diff --git a/plugins/device/cmd/nvidia/fingerprint.go b/plugins/device/cmd/nvidia/fingerprint.go
new file mode 100644
index 000000000..171b01844
--- /dev/null
+++ b/plugins/device/cmd/nvidia/fingerprint.go
@@ -0,0 +1,245 @@
+package nvidia
+
+import (
+	"context"
+	"fmt"
+	"time"
+
+	"github.com/hashicorp/nomad/plugins/device"
+	"github.com/hashicorp/nomad/plugins/device/cmd/nvidia/nvml"
+)
+
+const (
+	// Attribute names used as keys of the device group attributes in Fingerprint output
+	MemoryMiBAttr          = "memory_mib"
+	PowerWAttr             = "power_w"
+	BAR1MiBAttr            = "bar1_mib"
+	DriverVersionAttr      = "driver_version"
+	CoresClockMHzAttr      = "cores_clock_mhz"
+	MemoryClockMHzAttr     = "memory_clock_mhz"
+	PCIBandwidthMBPerSAttr = "pci_bandwidth_mb/s"
+	DisplayStateAttr       = "display_state"
+	PersistenceModeAttr    = "persistence_mode"
+)
+
+// fingerprint is the long running goroutine that detects hardware. It closes
+// devices when it returns, and returns once ctx is cancelled (checked before
+// every fingerprint round).
+func (d *NvidiaDevice) fingerprint(ctx context.Context, devices chan<- *device.FingerprintResponse) {
+	defer close(devices)
+
+	if d.nvmlClientInitializationError != nil {
+		d.logger.Error("exiting fingerprinting due to problems with NVML loading", "error", d.nvmlClientInitializationError)
+		// write empty fingerprint response to let server know that there are
+		// no working Nvidia GPU units; give up when ctx is cancelled first so
+		// that the goroutine is not leaked if nobody reads from the channel
+		// anymore
+		select {
+		case devices <- device.NewFingerprint():
+		case <-ctx.Done():
+		}
+		return
+	}
+
+	// Create a timer that will fire immediately for the first detection
+	timer := time.NewTimer(0)
+	defer timer.Stop()
+
+	for {
+		select {
+		case <-ctx.Done():
+			return
+		case <-timer.C:
+			timer.Reset(d.fingerprintPeriod)
+		}
+		d.writeFingerprintToChannel(devices)
+	}
+}
+
+// writeFingerprintToChannel makes nvml call and writes response to channel.
+// A failed nvml call is reported as an error response. A successful call
+// produces a response only when the set of eligible device UUIDs differs from
+// the one seen in the previous round.
+func (d *NvidiaDevice) writeFingerprintToChannel(devices chan<- *device.FingerprintResponse) {
+	fingerprintData, err := d.nvmlClient.GetFingerprintData()
+	if err != nil {
+		d.logger.Error("failed to get fingerprint nvidia devices", "error", err)
+		devices <- device.NewFingerprintError(err)
+		return
+	}
+
+	// ignore devices from fingerprint output
+	fingerprintDevices := ignoreFingerprintedDevices(fingerprintData.Devices, d.ignoredGPUIDs)
+	// nothing to report unless a device appeared or disappeared since the last run
+	if !d.fingerprintChanged(fingerprintDevices) {
+		return
+	}
+
+	commonAttributes := map[string]string{
+		DriverVersionAttr: fingerprintData.DriverVersion,
+	}
+
+	// Group all FingerprintDevices by DeviceName attribute
+	deviceListByDeviceName := make(map[string][]*nvml.FingerprintDeviceData)
+	for _, fingerprintDevice := range fingerprintDevices {
+		// Devices whose name was not detected by the nvml driver are placed
+		// into a single group named 'notAvailable'
+		groupName := notAvailable
+		if fingerprintDevice.DeviceName != nil {
+			groupName = *fingerprintDevice.DeviceName
+		}
+		deviceListByDeviceName[groupName] = append(deviceListByDeviceName[groupName], fingerprintDevice)
+	}
+
+	// Build Fingerprint response with computed groups and send it over the channel
+	deviceGroups := make([]*device.DeviceGroup, 0, len(deviceListByDeviceName))
+	for groupName, groupDevices := range deviceListByDeviceName {
+		deviceGroups = append(deviceGroups, deviceGroupFromFingerprintData(groupName, groupDevices, commonAttributes))
+	}
+	devices <- device.NewFingerprint(deviceGroups...)
+}
+
+// ignoreFingerprintedDevices excludes ignored devices from fingerprint output.
+// The relative order of the remaining devices is kept; the result is nil when
+// every device is ignored or deviceData is empty.
+func ignoreFingerprintedDevices(deviceData []*nvml.FingerprintDeviceData, ignoredGPUIDs map[string]struct{}) []*nvml.FingerprintDeviceData {
+	var result []*nvml.FingerprintDeviceData
+	for _, fingerprintDevice := range deviceData {
+		if _, ignored := ignoredGPUIDs[fingerprintDevice.UUID]; !ignored {
+			result = append(result, fingerprintDevice)
+		}
+	}
+	return result
+}
+
+// fingerprintChanged checks if there are any previously unseen nvidia devices located
+// or any of fingerprinted nvidia devices disappeared since the last fingerprint run.
+// Also, this func updates device map on NvidiaDevice with the latest data +func (d *NvidiaDevice) fingerprintChanged(allDevices []*nvml.FingerprintDeviceData) bool { + d.deviceLock.Lock() + defer d.deviceLock.Unlock() + + changeDetected := false + // check if every device in allDevices is in d.devices + for _, device := range allDevices { + if _, ok := d.devices[device.UUID]; !ok { + changeDetected = true + } + } + + // check if every device in d.devices is in allDevices + fingerprintDeviceMap := make(map[string]struct{}) + for _, device := range allDevices { + fingerprintDeviceMap[device.UUID] = struct{}{} + } + for id := range d.devices { + if _, ok := fingerprintDeviceMap[id]; !ok { + changeDetected = true + } + } + + d.devices = fingerprintDeviceMap + return changeDetected +} + +// deviceGroupFromFingerprintData composes deviceGroup from FingerprintDeviceData slice +func deviceGroupFromFingerprintData(groupName string, deviceList []*nvml.FingerprintDeviceData, commonAttributes map[string]string) *device.DeviceGroup { + // deviceGroup without devices makes no sense -> return nil when no devices are provided + if len(deviceList) == 0 { + return nil + } + + devices := make([]*device.Device, len(deviceList)) + for index, dev := range deviceList { + devices[index] = &device.Device{ + ID: dev.UUID, + // all fingerprinted devices are "healthy" for now + // to get real health data -> dcgm bindings should be used + Healthy: true, + HwLocality: &device.DeviceLocality{ + PciBusID: dev.PCIBusID, + }, + } + } + + deviceGroup := &device.DeviceGroup{ + Vendor: vendor, + Type: deviceType, + Name: groupName, + Devices: devices, + // Assumption made that devices with the same DeviceName have the same + // attributes like amount of memory, power, bar1memory etc + Attributes: attributesFromFingerprintDeviceData(deviceList[0]), + } + + // Extend attribute map with common attributes + for attributeKey, attributeValue := range commonAttributes { + 
deviceGroup.Attributes[attributeKey] = attributeValue + } + + return deviceGroup +} + +// attributesFromFingerprintDeviceData converts nvml.FingerprintDeviceData +// struct to device.DeviceGroup.Attributes format (map[string]string) +// this function performs all nil checks for FingerprintDeviceData pointers +func attributesFromFingerprintDeviceData(fingerprintDeviceData *nvml.FingerprintDeviceData) map[string]string { + // The following fields in FingerprintDeviceData are pointers, so they can be nil + // In case they are nil -> return 'notAvailable' constant instead + var ( + MemoryMiB string + PowerW string + BAR1MiB string + CoresClockMHz string + MemoryClockMHz string + PCIBandwidthMBPerS string + ) + + if fingerprintDeviceData.MemoryMiB == nil { + MemoryMiB = notAvailable + } else { + MemoryMiB = fmt.Sprint(*fingerprintDeviceData.MemoryMiB) + } + + if fingerprintDeviceData.PowerW == nil { + PowerW = notAvailable + } else { + PowerW = fmt.Sprint(*fingerprintDeviceData.PowerW) + } + + if fingerprintDeviceData.BAR1MiB == nil { + BAR1MiB = notAvailable + } else { + BAR1MiB = fmt.Sprint(*fingerprintDeviceData.BAR1MiB) + } + + if fingerprintDeviceData.CoresClockMHz == nil { + CoresClockMHz = notAvailable + } else { + CoresClockMHz = fmt.Sprint(*fingerprintDeviceData.CoresClockMHz) + } + + if fingerprintDeviceData.MemoryClockMHz == nil { + MemoryClockMHz = notAvailable + } else { + MemoryClockMHz = fmt.Sprint(*fingerprintDeviceData.MemoryClockMHz) + } + + if fingerprintDeviceData.PCIBandwidthMBPerS == nil { + PCIBandwidthMBPerS = notAvailable + } else { + PCIBandwidthMBPerS = fmt.Sprint(*fingerprintDeviceData.PCIBandwidthMBPerS) + } + + return map[string]string{ + DisplayStateAttr: fingerprintDeviceData.DisplayState, + PersistenceModeAttr: fingerprintDeviceData.PersistenceMode, + MemoryMiBAttr: MemoryMiB, + PowerWAttr: PowerW, + BAR1MiBAttr: BAR1MiB, + CoresClockMHzAttr: CoresClockMHz, + MemoryClockMHzAttr: MemoryClockMHz, + PCIBandwidthMBPerSAttr: 
PCIBandwidthMBPerS, + } + +} diff --git a/plugins/device/cmd/nvidia/fingerprint_test.go b/plugins/device/cmd/nvidia/fingerprint_test.go new file mode 100644 index 000000000..b181ebd65 --- /dev/null +++ b/plugins/device/cmd/nvidia/fingerprint_test.go @@ -0,0 +1,1243 @@ +package nvidia + +import ( + "context" + "errors" + "sort" + "testing" + + hclog "github.com/hashicorp/go-hclog" + "github.com/hashicorp/nomad/helper" + "github.com/hashicorp/nomad/plugins/device" + "github.com/hashicorp/nomad/plugins/device/cmd/nvidia/nvml" + "github.com/stretchr/testify/require" +) + +func TestIgnoreFingerprintedDevices(t *testing.T) { + for _, testCase := range []struct { + Name string + DeviceData []*nvml.FingerprintDeviceData + IgnoredGPUIds map[string]struct{} + ExpectedResult []*nvml.FingerprintDeviceData + }{ + { + Name: "Odd ignored", + DeviceData: []*nvml.FingerprintDeviceData{ + { + DeviceData: &nvml.DeviceData{ + DeviceName: helper.StringToPtr("DeviceName1"), + UUID: "UUID1", + MemoryMiB: helper.Uint64ToPtr(1000), + }, + }, + { + DeviceData: &nvml.DeviceData{ + DeviceName: helper.StringToPtr("DeviceName2"), + UUID: "UUID2", + MemoryMiB: helper.Uint64ToPtr(1000), + }, + }, + { + DeviceData: &nvml.DeviceData{ + DeviceName: helper.StringToPtr("DeviceName3"), + UUID: "UUID3", + MemoryMiB: helper.Uint64ToPtr(1000), + }, + }, + }, + IgnoredGPUIds: map[string]struct{}{ + "UUID2": {}, + }, + ExpectedResult: []*nvml.FingerprintDeviceData{ + { + DeviceData: &nvml.DeviceData{ + DeviceName: helper.StringToPtr("DeviceName1"), + UUID: "UUID1", + MemoryMiB: helper.Uint64ToPtr(1000), + }, + }, + { + DeviceData: &nvml.DeviceData{ + DeviceName: helper.StringToPtr("DeviceName3"), + UUID: "UUID3", + MemoryMiB: helper.Uint64ToPtr(1000), + }, + }, + }, + }, + { + Name: "Even ignored", + DeviceData: []*nvml.FingerprintDeviceData{ + { + DeviceData: &nvml.DeviceData{ + DeviceName: helper.StringToPtr("DeviceName1"), + UUID: "UUID1", + MemoryMiB: helper.Uint64ToPtr(1000), + }, + }, + { + 
DeviceData: &nvml.DeviceData{ + DeviceName: helper.StringToPtr("DeviceName2"), + UUID: "UUID2", + MemoryMiB: helper.Uint64ToPtr(1000), + }, + }, + { + DeviceData: &nvml.DeviceData{ + DeviceName: helper.StringToPtr("DeviceName3"), + UUID: "UUID3", + MemoryMiB: helper.Uint64ToPtr(1000), + }, + }, + }, + IgnoredGPUIds: map[string]struct{}{ + "UUID1": {}, + "UUID3": {}, + }, + ExpectedResult: []*nvml.FingerprintDeviceData{ + { + DeviceData: &nvml.DeviceData{ + DeviceName: helper.StringToPtr("DeviceName2"), + UUID: "UUID2", + MemoryMiB: helper.Uint64ToPtr(1000), + }, + }, + }, + }, + { + Name: "All ignored", + DeviceData: []*nvml.FingerprintDeviceData{ + { + DeviceData: &nvml.DeviceData{ + DeviceName: helper.StringToPtr("DeviceName1"), + UUID: "UUID1", + MemoryMiB: helper.Uint64ToPtr(1000), + }, + }, + { + DeviceData: &nvml.DeviceData{ + DeviceName: helper.StringToPtr("DeviceName2"), + UUID: "UUID2", + MemoryMiB: helper.Uint64ToPtr(1000), + }, + }, + { + DeviceData: &nvml.DeviceData{ + DeviceName: helper.StringToPtr("DeviceName3"), + UUID: "UUID3", + MemoryMiB: helper.Uint64ToPtr(1000), + }, + }, + }, + IgnoredGPUIds: map[string]struct{}{ + "UUID1": {}, + "UUID2": {}, + "UUID3": {}, + }, + ExpectedResult: nil, + }, + { + Name: "No ignored", + DeviceData: []*nvml.FingerprintDeviceData{ + { + DeviceData: &nvml.DeviceData{ + DeviceName: helper.StringToPtr("DeviceName1"), + UUID: "UUID1", + MemoryMiB: helper.Uint64ToPtr(1000), + }, + }, + { + DeviceData: &nvml.DeviceData{ + DeviceName: helper.StringToPtr("DeviceName2"), + UUID: "UUID2", + MemoryMiB: helper.Uint64ToPtr(1000), + }, + }, + { + DeviceData: &nvml.DeviceData{ + DeviceName: helper.StringToPtr("DeviceName3"), + UUID: "UUID3", + MemoryMiB: helper.Uint64ToPtr(1000), + }, + }, + }, + IgnoredGPUIds: map[string]struct{}{}, + ExpectedResult: []*nvml.FingerprintDeviceData{ + { + DeviceData: &nvml.DeviceData{ + DeviceName: helper.StringToPtr("DeviceName1"), + UUID: "UUID1", + MemoryMiB: helper.Uint64ToPtr(1000), + }, + }, 
+ { + DeviceData: &nvml.DeviceData{ + DeviceName: helper.StringToPtr("DeviceName2"), + UUID: "UUID2", + MemoryMiB: helper.Uint64ToPtr(1000), + }, + }, + { + DeviceData: &nvml.DeviceData{ + DeviceName: helper.StringToPtr("DeviceName3"), + UUID: "UUID3", + MemoryMiB: helper.Uint64ToPtr(1000), + }, + }, + }, + }, + { + Name: "No DeviceData provided", + DeviceData: nil, + IgnoredGPUIds: map[string]struct{}{ + "UUID1": {}, + "UUID2": {}, + "UUID3": {}, + }, + ExpectedResult: nil, + }, + } { + actualResult := ignoreFingerprintedDevices(testCase.DeviceData, testCase.IgnoredGPUIds) + require.New(t).Equal(testCase.ExpectedResult, actualResult) + } +} + +func TestCheckFingerprintUpdates(t *testing.T) { + for _, testCase := range []struct { + Name string + Device *NvidiaDevice + AllDevices []*nvml.FingerprintDeviceData + DeviceMapAfterMethodCall map[string]struct{} + ExpectedResult bool + }{ + { + Name: "No updates", + Device: &NvidiaDevice{devices: map[string]struct{}{ + "1": {}, + "2": {}, + "3": {}, + }}, + AllDevices: []*nvml.FingerprintDeviceData{ + { + DeviceData: &nvml.DeviceData{ + UUID: "1", + }, + }, + { + DeviceData: &nvml.DeviceData{ + UUID: "2", + }, + }, + { + DeviceData: &nvml.DeviceData{ + UUID: "3", + }, + }, + }, + ExpectedResult: false, + DeviceMapAfterMethodCall: map[string]struct{}{ + "1": {}, + "2": {}, + "3": {}, + }, + }, + { + Name: "New Device Appeared", + Device: &NvidiaDevice{devices: map[string]struct{}{ + "1": {}, + "2": {}, + "3": {}, + }}, + AllDevices: []*nvml.FingerprintDeviceData{ + { + DeviceData: &nvml.DeviceData{ + UUID: "1", + }, + }, + { + DeviceData: &nvml.DeviceData{ + UUID: "2", + }, + }, + { + DeviceData: &nvml.DeviceData{ + UUID: "3", + }, + }, + { + DeviceData: &nvml.DeviceData{ + UUID: "I am new", + }, + }, + }, + ExpectedResult: true, + DeviceMapAfterMethodCall: map[string]struct{}{ + "1": {}, + "2": {}, + "3": {}, + "I am new": {}, + }, + }, + { + Name: "Device disappeared", + Device: &NvidiaDevice{devices: map[string]struct{}{ 
+ "1": {}, + "2": {}, + "3": {}, + }}, + AllDevices: []*nvml.FingerprintDeviceData{ + { + DeviceData: &nvml.DeviceData{ + UUID: "1", + }, + }, + { + DeviceData: &nvml.DeviceData{ + UUID: "2", + }, + }, + }, + ExpectedResult: true, + DeviceMapAfterMethodCall: map[string]struct{}{ + "1": {}, + "2": {}, + }, + }, + { + Name: "No devices in NvidiaDevice map", + Device: &NvidiaDevice{}, + AllDevices: []*nvml.FingerprintDeviceData{ + { + DeviceData: &nvml.DeviceData{ + UUID: "1", + }, + }, + { + DeviceData: &nvml.DeviceData{ + UUID: "2", + }, + }, + { + DeviceData: &nvml.DeviceData{ + UUID: "3", + }, + }, + }, + ExpectedResult: true, + DeviceMapAfterMethodCall: map[string]struct{}{ + "1": {}, + "2": {}, + "3": {}, + }, + }, + { + Name: "No devices detected", + Device: &NvidiaDevice{devices: map[string]struct{}{ + "1": {}, + "2": {}, + "3": {}, + }}, + AllDevices: nil, + ExpectedResult: true, + DeviceMapAfterMethodCall: map[string]struct{}{}, + }, + } { + actualResult := testCase.Device.fingerprintChanged(testCase.AllDevices) + req := require.New(t) + // check that function returns valid "updated / not updated" state + req.Equal(testCase.ExpectedResult, actualResult) + // check that function propely updates devices map + req.Equal(testCase.Device.devices, testCase.DeviceMapAfterMethodCall) + } +} + +func TestAttributesFromFingerprintDeviceData(t *testing.T) { + for _, testCase := range []struct { + Name string + FingerprintDeviceData *nvml.FingerprintDeviceData + ExpectedResult map[string]string + }{ + { + Name: "All attributes are not nil", + FingerprintDeviceData: &nvml.FingerprintDeviceData{ + DeviceData: &nvml.DeviceData{ + UUID: "1", + DeviceName: helper.StringToPtr("Type1"), + MemoryMiB: helper.Uint64ToPtr(256), + PowerW: helper.UintToPtr(2), + BAR1MiB: helper.Uint64ToPtr(256), + }, + PCIBusID: "pciBusID1", + PCIBandwidthMBPerS: helper.UintToPtr(1), + CoresClockMHz: helper.UintToPtr(1), + MemoryClockMHz: helper.UintToPtr(1), + DisplayState: "Enabled", + 
PersistenceMode: "Enabled", + }, + ExpectedResult: map[string]string{ + MemoryMiBAttr: "256", + PowerWAttr: "2", + BAR1MiBAttr: "256", + PCIBandwidthMBPerSAttr: "1", + CoresClockMHzAttr: "1", + MemoryClockMHzAttr: "1", + DisplayStateAttr: "Enabled", + PersistenceModeAttr: "Enabled", + }, + }, + { + Name: "MemoryMiB is nil and has to be replaced to N/A", + FingerprintDeviceData: &nvml.FingerprintDeviceData{ + DeviceData: &nvml.DeviceData{ + UUID: "1", + DeviceName: helper.StringToPtr("Type1"), + MemoryMiB: nil, + PowerW: helper.UintToPtr(2), + BAR1MiB: helper.Uint64ToPtr(256), + }, + PCIBusID: "pciBusID1", + PCIBandwidthMBPerS: helper.UintToPtr(1), + CoresClockMHz: helper.UintToPtr(1), + MemoryClockMHz: helper.UintToPtr(1), + DisplayState: "Enabled", + PersistenceMode: "Enabled", + }, + ExpectedResult: map[string]string{ + MemoryMiBAttr: notAvailable, + PowerWAttr: "2", + BAR1MiBAttr: "256", + PCIBandwidthMBPerSAttr: "1", + CoresClockMHzAttr: "1", + MemoryClockMHzAttr: "1", + DisplayStateAttr: "Enabled", + PersistenceModeAttr: "Enabled", + }, + }, + { + Name: "PowerW is nil and has to be replaced to N/A", + FingerprintDeviceData: &nvml.FingerprintDeviceData{ + DeviceData: &nvml.DeviceData{ + UUID: "1", + DeviceName: helper.StringToPtr("Type1"), + MemoryMiB: helper.Uint64ToPtr(256), + PowerW: nil, + BAR1MiB: helper.Uint64ToPtr(256), + }, + PCIBusID: "pciBusID1", + PCIBandwidthMBPerS: helper.UintToPtr(1), + CoresClockMHz: helper.UintToPtr(1), + MemoryClockMHz: helper.UintToPtr(1), + DisplayState: "Enabled", + PersistenceMode: "Enabled", + }, + ExpectedResult: map[string]string{ + MemoryMiBAttr: "256", + PowerWAttr: notAvailable, + BAR1MiBAttr: "256", + PCIBandwidthMBPerSAttr: "1", + CoresClockMHzAttr: "1", + MemoryClockMHzAttr: "1", + DisplayStateAttr: "Enabled", + PersistenceModeAttr: "Enabled", + }, + }, + { + Name: "BAR1MiB is nil and has to be replaced to N/A", + FingerprintDeviceData: &nvml.FingerprintDeviceData{ + DeviceData: &nvml.DeviceData{ + UUID: "1", + 
DeviceName: helper.StringToPtr("Type1"), + MemoryMiB: helper.Uint64ToPtr(256), + PowerW: helper.UintToPtr(2), + BAR1MiB: nil, + }, + PCIBusID: "pciBusID1", + PCIBandwidthMBPerS: helper.UintToPtr(1), + CoresClockMHz: helper.UintToPtr(1), + MemoryClockMHz: helper.UintToPtr(1), + DisplayState: "Enabled", + PersistenceMode: "Enabled", + }, + ExpectedResult: map[string]string{ + MemoryMiBAttr: "256", + PowerWAttr: "2", + BAR1MiBAttr: notAvailable, + PCIBandwidthMBPerSAttr: "1", + CoresClockMHzAttr: "1", + MemoryClockMHzAttr: "1", + DisplayStateAttr: "Enabled", + PersistenceModeAttr: "Enabled", + }, + }, + { + Name: "PCIBandwidthMBPerS is nil and has to be replaced to N/A", + FingerprintDeviceData: &nvml.FingerprintDeviceData{ + DeviceData: &nvml.DeviceData{ + UUID: "1", + DeviceName: helper.StringToPtr("Type1"), + MemoryMiB: helper.Uint64ToPtr(256), + PowerW: helper.UintToPtr(2), + BAR1MiB: helper.Uint64ToPtr(256), + }, + PCIBusID: "pciBusID1", + PCIBandwidthMBPerS: nil, + CoresClockMHz: helper.UintToPtr(1), + MemoryClockMHz: helper.UintToPtr(1), + DisplayState: "Enabled", + PersistenceMode: "Enabled", + }, + ExpectedResult: map[string]string{ + MemoryMiBAttr: "256", + PowerWAttr: "2", + BAR1MiBAttr: "256", + PCIBandwidthMBPerSAttr: notAvailable, + CoresClockMHzAttr: "1", + MemoryClockMHzAttr: "1", + DisplayStateAttr: "Enabled", + PersistenceModeAttr: "Enabled", + }, + }, + { + Name: "CoresClockMHz is nil and has to be replaced to N/A", + FingerprintDeviceData: &nvml.FingerprintDeviceData{ + DeviceData: &nvml.DeviceData{ + UUID: "1", + DeviceName: helper.StringToPtr("Type1"), + MemoryMiB: helper.Uint64ToPtr(256), + PowerW: helper.UintToPtr(2), + BAR1MiB: helper.Uint64ToPtr(256), + }, + PCIBusID: "pciBusID1", + PCIBandwidthMBPerS: helper.UintToPtr(1), + CoresClockMHz: nil, + MemoryClockMHz: helper.UintToPtr(1), + DisplayState: "Enabled", + PersistenceMode: "Enabled", + }, + ExpectedResult: map[string]string{ + MemoryMiBAttr: "256", + PowerWAttr: "2", + BAR1MiBAttr: 
"256", + PCIBandwidthMBPerSAttr: "1", + CoresClockMHzAttr: notAvailable, + MemoryClockMHzAttr: "1", + DisplayStateAttr: "Enabled", + PersistenceModeAttr: "Enabled", + }, + }, + { + Name: "MemoryClockMHz is nil and has to be replaced to N/A", + FingerprintDeviceData: &nvml.FingerprintDeviceData{ + DeviceData: &nvml.DeviceData{ + UUID: "1", + DeviceName: helper.StringToPtr("Type1"), + MemoryMiB: helper.Uint64ToPtr(256), + PowerW: helper.UintToPtr(2), + BAR1MiB: helper.Uint64ToPtr(256), + }, + PCIBusID: "pciBusID1", + PCIBandwidthMBPerS: helper.UintToPtr(1), + CoresClockMHz: helper.UintToPtr(1), + MemoryClockMHz: nil, + DisplayState: "Enabled", + PersistenceMode: "Enabled", + }, + ExpectedResult: map[string]string{ + MemoryMiBAttr: "256", + PowerWAttr: "2", + BAR1MiBAttr: "256", + PCIBandwidthMBPerSAttr: "1", + CoresClockMHzAttr: "1", + MemoryClockMHzAttr: notAvailable, + DisplayStateAttr: "Enabled", + PersistenceModeAttr: "Enabled", + }, + }, + } { + actualResult := attributesFromFingerprintDeviceData(testCase.FingerprintDeviceData) + require.New(t).Equal(testCase.ExpectedResult, actualResult) + } +} + +func TestDeviceGroupFromFingerprintData(t *testing.T) { + for _, testCase := range []struct { + Name string + GroupName string + Devices []*nvml.FingerprintDeviceData + CommonAttributes map[string]string + ExpectedResult *device.DeviceGroup + }{ + { + Name: "Devices are provided", + GroupName: "Type1", + Devices: []*nvml.FingerprintDeviceData{ + { + DeviceData: &nvml.DeviceData{ + UUID: "1", + DeviceName: helper.StringToPtr("Type1"), + MemoryMiB: helper.Uint64ToPtr(100), + PowerW: helper.UintToPtr(2), + BAR1MiB: helper.Uint64ToPtr(256), + }, + PCIBusID: "pciBusID1", + PCIBandwidthMBPerS: helper.UintToPtr(1), + CoresClockMHz: helper.UintToPtr(1), + MemoryClockMHz: helper.UintToPtr(1), + DisplayState: "Enabled", + PersistenceMode: "Enabled", + }, + { + DeviceData: &nvml.DeviceData{ + UUID: "2", + DeviceName: helper.StringToPtr("Type1"), + MemoryMiB: 
helper.Uint64ToPtr(100), + PowerW: helper.UintToPtr(2), + BAR1MiB: helper.Uint64ToPtr(256), + }, + PCIBusID: "pciBusID2", + PCIBandwidthMBPerS: helper.UintToPtr(1), + CoresClockMHz: helper.UintToPtr(1), + MemoryClockMHz: helper.UintToPtr(1), + DisplayState: "Enabled", + PersistenceMode: "Enabled", + }, + }, + ExpectedResult: &device.DeviceGroup{ + Vendor: vendor, + Type: deviceType, + Name: "Type1", + Devices: []*device.Device{ + { + ID: "1", + Healthy: true, + HwLocality: &device.DeviceLocality{ + PciBusID: "pciBusID1", + }, + }, + { + ID: "2", + Healthy: true, + HwLocality: &device.DeviceLocality{ + PciBusID: "pciBusID2", + }, + }, + }, + Attributes: map[string]string{ + MemoryMiBAttr: "100", + PowerWAttr: "2", + BAR1MiBAttr: "256", + PCIBandwidthMBPerSAttr: "1", + CoresClockMHzAttr: "1", + MemoryClockMHzAttr: "1", + DisplayStateAttr: "Enabled", + PersistenceModeAttr: "Enabled", + }, + }, + }, + { + Name: "Devices and common attributes are provided", + GroupName: "Type1", + Devices: []*nvml.FingerprintDeviceData{ + { + DeviceData: &nvml.DeviceData{ + UUID: "1", + DeviceName: helper.StringToPtr("Type1"), + MemoryMiB: helper.Uint64ToPtr(100), + PowerW: helper.UintToPtr(2), + BAR1MiB: helper.Uint64ToPtr(256), + }, + PCIBusID: "pciBusID1", + PCIBandwidthMBPerS: helper.UintToPtr(1), + CoresClockMHz: helper.UintToPtr(1), + MemoryClockMHz: helper.UintToPtr(1), + DisplayState: "Enabled", + PersistenceMode: "Enabled", + }, + { + DeviceData: &nvml.DeviceData{ + UUID: "2", + DeviceName: helper.StringToPtr("Type1"), + MemoryMiB: helper.Uint64ToPtr(100), + PowerW: helper.UintToPtr(2), + BAR1MiB: helper.Uint64ToPtr(256), + }, + PCIBusID: "pciBusID2", + PCIBandwidthMBPerS: helper.UintToPtr(1), + CoresClockMHz: helper.UintToPtr(1), + MemoryClockMHz: helper.UintToPtr(1), + DisplayState: "Enabled", + PersistenceMode: "Enabled", + }, + }, + CommonAttributes: map[string]string{ + DriverVersionAttr: "1", + }, + ExpectedResult: &device.DeviceGroup{ + Vendor: vendor, + Type: 
deviceType, + Name: "Type1", + Devices: []*device.Device{ + { + ID: "1", + Healthy: true, + HwLocality: &device.DeviceLocality{ + PciBusID: "pciBusID1", + }, + }, + { + ID: "2", + Healthy: true, + HwLocality: &device.DeviceLocality{ + PciBusID: "pciBusID2", + }, + }, + }, + Attributes: map[string]string{ + MemoryMiBAttr: "100", + PowerWAttr: "2", + BAR1MiBAttr: "256", + DriverVersionAttr: "1", + PCIBandwidthMBPerSAttr: "1", + CoresClockMHzAttr: "1", + MemoryClockMHzAttr: "1", + DisplayStateAttr: "Enabled", + PersistenceModeAttr: "Enabled", + }, + }, + }, + { + Name: "Devices are not provided", + GroupName: "Type1", + CommonAttributes: map[string]string{ + DriverVersionAttr: "1", + }, + Devices: nil, + ExpectedResult: nil, + }, + } { + actualResult := deviceGroupFromFingerprintData(testCase.GroupName, testCase.Devices, testCase.CommonAttributes) + require.New(t).Equal(testCase.ExpectedResult, actualResult) + } +} + +func TestWriteFingerprintToChannel(t *testing.T) { + for _, testCase := range []struct { + Name string + Device *NvidiaDevice + ExpectedWriteToChannel *device.FingerprintResponse + }{ + { + Name: "Check that FingerprintError is handled properly", + Device: &NvidiaDevice{ + nvmlClient: &MockNvmlClient{ + FingerprintError: errors.New(""), + }, + logger: hclog.NewNullLogger(), + }, + ExpectedWriteToChannel: &device.FingerprintResponse{ + Error: errors.New(""), + }, + }, + { + Name: "Check ignore devices works correctly", + Device: &NvidiaDevice{ + nvmlClient: &MockNvmlClient{ + FingerprintResponseReturned: &nvml.FingerprintData{ + DriverVersion: "1", + Devices: []*nvml.FingerprintDeviceData{ + { + DeviceData: &nvml.DeviceData{ + UUID: "1", + DeviceName: helper.StringToPtr("Name"), + MemoryMiB: helper.Uint64ToPtr(10), + PowerW: helper.UintToPtr(100), + BAR1MiB: helper.Uint64ToPtr(256), + }, + PCIBusID: "pciBusID1", + PCIBandwidthMBPerS: helper.UintToPtr(1), + CoresClockMHz: helper.UintToPtr(1), + MemoryClockMHz: helper.UintToPtr(1), + DisplayState: 
"Enabled", + PersistenceMode: "Enabled", + }, + { + DeviceData: &nvml.DeviceData{ + UUID: "2", + DeviceName: helper.StringToPtr("Name"), + MemoryMiB: helper.Uint64ToPtr(10), + PowerW: helper.UintToPtr(100), + BAR1MiB: helper.Uint64ToPtr(256), + }, + PCIBusID: "pciBusID2", + PCIBandwidthMBPerS: helper.UintToPtr(1), + CoresClockMHz: helper.UintToPtr(1), + MemoryClockMHz: helper.UintToPtr(1), + DisplayState: "Enabled", + PersistenceMode: "Enabled", + }, + }, + }, + }, + ignoredGPUIDs: map[string]struct{}{ + "1": {}, + }, + logger: hclog.NewNullLogger(), + }, + ExpectedWriteToChannel: &device.FingerprintResponse{ + Devices: []*device.DeviceGroup{ + { + Vendor: vendor, + Type: deviceType, + Name: "Name", + Devices: []*device.Device{ + { + ID: "2", + Healthy: true, + HwLocality: &device.DeviceLocality{ + PciBusID: "pciBusID2", + }, + }, + }, + Attributes: map[string]string{ + MemoryMiBAttr: "10", + PowerWAttr: "100", + BAR1MiBAttr: "256", + DriverVersionAttr: "1", + PCIBandwidthMBPerSAttr: "1", + CoresClockMHzAttr: "1", + MemoryClockMHzAttr: "1", + DisplayStateAttr: "Enabled", + PersistenceModeAttr: "Enabled", + }, + }, + }, + }, + }, + { + Name: "Check devices are split to multiple device groups 1", + Device: &NvidiaDevice{ + nvmlClient: &MockNvmlClient{ + FingerprintResponseReturned: &nvml.FingerprintData{ + DriverVersion: "1", + Devices: []*nvml.FingerprintDeviceData{ + { + DeviceData: &nvml.DeviceData{ + UUID: "1", + DeviceName: helper.StringToPtr("Name1"), + MemoryMiB: helper.Uint64ToPtr(10), + PowerW: helper.UintToPtr(100), + BAR1MiB: helper.Uint64ToPtr(256), + }, + PCIBusID: "pciBusID1", + PCIBandwidthMBPerS: helper.UintToPtr(1), + CoresClockMHz: helper.UintToPtr(1), + MemoryClockMHz: helper.UintToPtr(1), + DisplayState: "Enabled", + PersistenceMode: "Enabled", + }, + { + DeviceData: &nvml.DeviceData{ + UUID: "2", + DeviceName: helper.StringToPtr("Name2"), + MemoryMiB: helper.Uint64ToPtr(11), + PowerW: helper.UintToPtr(100), + BAR1MiB: helper.Uint64ToPtr(256), + 
}, + PCIBusID: "pciBusID2", + PCIBandwidthMBPerS: helper.UintToPtr(1), + CoresClockMHz: helper.UintToPtr(1), + MemoryClockMHz: helper.UintToPtr(1), + DisplayState: "Enabled", + PersistenceMode: "Enabled", + }, + { + DeviceData: &nvml.DeviceData{ + UUID: "3", + DeviceName: helper.StringToPtr("Name3"), + MemoryMiB: helper.Uint64ToPtr(12), + PowerW: helper.UintToPtr(100), + BAR1MiB: helper.Uint64ToPtr(256), + }, + PCIBusID: "pciBusID3", + PCIBandwidthMBPerS: helper.UintToPtr(1), + CoresClockMHz: helper.UintToPtr(1), + MemoryClockMHz: helper.UintToPtr(1), + DisplayState: "Enabled", + PersistenceMode: "Enabled", + }, + }, + }, + }, + logger: hclog.NewNullLogger(), + }, + ExpectedWriteToChannel: &device.FingerprintResponse{ + Devices: []*device.DeviceGroup{ + { + Vendor: vendor, + Type: deviceType, + Name: "Name1", + Devices: []*device.Device{ + { + ID: "1", + Healthy: true, + HwLocality: &device.DeviceLocality{ + PciBusID: "pciBusID1", + }, + }, + }, + Attributes: map[string]string{ + MemoryMiBAttr: "10", + DriverVersionAttr: "1", + PowerWAttr: "100", + BAR1MiBAttr: "256", + PCIBandwidthMBPerSAttr: "1", + CoresClockMHzAttr: "1", + MemoryClockMHzAttr: "1", + DisplayStateAttr: "Enabled", + PersistenceModeAttr: "Enabled", + }, + }, + { + Vendor: vendor, + Type: deviceType, + Name: "Name2", + Devices: []*device.Device{ + { + ID: "2", + Healthy: true, + HwLocality: &device.DeviceLocality{ + PciBusID: "pciBusID2", + }, + }, + }, + Attributes: map[string]string{ + MemoryMiBAttr: "11", + DriverVersionAttr: "1", + PowerWAttr: "100", + BAR1MiBAttr: "256", + PCIBandwidthMBPerSAttr: "1", + CoresClockMHzAttr: "1", + MemoryClockMHzAttr: "1", + DisplayStateAttr: "Enabled", + PersistenceModeAttr: "Enabled", + }, + }, + { + Vendor: vendor, + Type: deviceType, + Name: "Name3", + Devices: []*device.Device{ + { + ID: "3", + Healthy: true, + HwLocality: &device.DeviceLocality{ + PciBusID: "pciBusID3", + }, + }, + }, + Attributes: map[string]string{ + MemoryMiBAttr: "12", + 
DriverVersionAttr: "1", + PowerWAttr: "100", + BAR1MiBAttr: "256", + PCIBandwidthMBPerSAttr: "1", + CoresClockMHzAttr: "1", + MemoryClockMHzAttr: "1", + DisplayStateAttr: "Enabled", + PersistenceModeAttr: "Enabled", + }, + }, + }, + }, + }, + { + Name: "Check devices are split to multiple device groups 2", + Device: &NvidiaDevice{ + nvmlClient: &MockNvmlClient{ + FingerprintResponseReturned: &nvml.FingerprintData{ + DriverVersion: "1", + Devices: []*nvml.FingerprintDeviceData{ + { + DeviceData: &nvml.DeviceData{ + UUID: "1", + DeviceName: helper.StringToPtr("Name1"), + MemoryMiB: helper.Uint64ToPtr(10), + PowerW: helper.UintToPtr(100), + BAR1MiB: helper.Uint64ToPtr(256), + }, + PCIBusID: "pciBusID1", + PCIBandwidthMBPerS: helper.UintToPtr(1), + CoresClockMHz: helper.UintToPtr(1), + MemoryClockMHz: helper.UintToPtr(1), + DisplayState: "Enabled", + PersistenceMode: "Enabled", + }, + { + DeviceData: &nvml.DeviceData{ + UUID: "2", + DeviceName: helper.StringToPtr("Name2"), + MemoryMiB: helper.Uint64ToPtr(11), + PowerW: helper.UintToPtr(100), + BAR1MiB: helper.Uint64ToPtr(256), + }, + PCIBusID: "pciBusID2", + PCIBandwidthMBPerS: helper.UintToPtr(1), + CoresClockMHz: helper.UintToPtr(1), + MemoryClockMHz: helper.UintToPtr(1), + DisplayState: "Enabled", + PersistenceMode: "Enabled", + }, + { + DeviceData: &nvml.DeviceData{ + UUID: "3", + DeviceName: helper.StringToPtr("Name2"), + MemoryMiB: helper.Uint64ToPtr(12), + PowerW: helper.UintToPtr(100), + BAR1MiB: helper.Uint64ToPtr(256), + }, + PCIBusID: "pciBusID3", + PCIBandwidthMBPerS: helper.UintToPtr(1), + CoresClockMHz: helper.UintToPtr(1), + MemoryClockMHz: helper.UintToPtr(1), + DisplayState: "Enabled", + PersistenceMode: "Enabled", + }, + }, + }, + }, + logger: hclog.NewNullLogger(), + }, + ExpectedWriteToChannel: &device.FingerprintResponse{ + Devices: []*device.DeviceGroup{ + { + Vendor: vendor, + Type: deviceType, + Name: "Name1", + Devices: []*device.Device{ + { + ID: "1", + Healthy: true, + HwLocality: 
&device.DeviceLocality{ + PciBusID: "pciBusID1", + }, + }, + }, + Attributes: map[string]string{ + MemoryMiBAttr: "10", + DriverVersionAttr: "1", + PowerWAttr: "100", + BAR1MiBAttr: "256", + PCIBandwidthMBPerSAttr: "1", + CoresClockMHzAttr: "1", + MemoryClockMHzAttr: "1", + DisplayStateAttr: "Enabled", + PersistenceModeAttr: "Enabled", + }, + }, + { + Vendor: vendor, + Type: deviceType, + Name: "Name2", + Devices: []*device.Device{ + { + ID: "2", + Healthy: true, + HwLocality: &device.DeviceLocality{ + PciBusID: "pciBusID2", + }, + }, + { + ID: "3", + Healthy: true, + HwLocality: &device.DeviceLocality{ + PciBusID: "pciBusID3", + }, + }, + }, + Attributes: map[string]string{ + MemoryMiBAttr: "11", + DriverVersionAttr: "1", + PowerWAttr: "100", + BAR1MiBAttr: "256", + PCIBandwidthMBPerSAttr: "1", + CoresClockMHzAttr: "1", + MemoryClockMHzAttr: "1", + DisplayStateAttr: "Enabled", + PersistenceModeAttr: "Enabled", + }, + }, + }, + }, + }, + } { + channel := make(chan *device.FingerprintResponse, 1) + testCase.Device.writeFingerprintToChannel(channel) + actualResult := <-channel + // writeFingerprintToChannel iterates over map keys + // and insterts results to an array, so order of elements in output array + // may be different + // actualResult, expectedResult arrays has to be sorted firsted + sort.Slice(actualResult.Devices, func(i, j int) bool { + return actualResult.Devices[i].Name < actualResult.Devices[j].Name + }) + sort.Slice(testCase.ExpectedWriteToChannel.Devices, func(i, j int) bool { + return testCase.ExpectedWriteToChannel.Devices[i].Name < testCase.ExpectedWriteToChannel.Devices[j].Name + }) + require.New(t).Equal(testCase.ExpectedWriteToChannel, actualResult) + } +} + +// Test if nonworking driver returns empty fingerprint data +func TestFingerprint(t *testing.T) { + for _, testCase := range []struct { + Name string + Device *NvidiaDevice + ExpectedWriteToChannel *device.FingerprintResponse + }{ + { + Name: "Check that working driver returns valid 
fingeprint data", + Device: &NvidiaDevice{ + nvmlClientInitializationError: nil, + nvmlClient: &MockNvmlClient{ + FingerprintResponseReturned: &nvml.FingerprintData{ + DriverVersion: "1", + Devices: []*nvml.FingerprintDeviceData{ + { + DeviceData: &nvml.DeviceData{ + UUID: "1", + DeviceName: helper.StringToPtr("Name1"), + MemoryMiB: helper.Uint64ToPtr(10), + PowerW: helper.UintToPtr(100), + BAR1MiB: helper.Uint64ToPtr(256), + }, + PCIBusID: "pciBusID1", + PCIBandwidthMBPerS: helper.UintToPtr(1), + CoresClockMHz: helper.UintToPtr(1), + MemoryClockMHz: helper.UintToPtr(1), + DisplayState: "Enabled", + PersistenceMode: "Enabled", + }, + { + DeviceData: &nvml.DeviceData{ + UUID: "2", + DeviceName: helper.StringToPtr("Name1"), + MemoryMiB: helper.Uint64ToPtr(10), + PowerW: helper.UintToPtr(100), + BAR1MiB: helper.Uint64ToPtr(256), + }, + PCIBusID: "pciBusID2", + PCIBandwidthMBPerS: helper.UintToPtr(1), + CoresClockMHz: helper.UintToPtr(1), + MemoryClockMHz: helper.UintToPtr(1), + DisplayState: "Enabled", + PersistenceMode: "Enabled", + }, + { + DeviceData: &nvml.DeviceData{ + UUID: "3", + DeviceName: helper.StringToPtr("Name1"), + MemoryMiB: helper.Uint64ToPtr(10), + PowerW: helper.UintToPtr(100), + BAR1MiB: helper.Uint64ToPtr(256), + }, + PCIBusID: "pciBusID3", + PCIBandwidthMBPerS: helper.UintToPtr(1), + CoresClockMHz: helper.UintToPtr(1), + MemoryClockMHz: helper.UintToPtr(1), + DisplayState: "Enabled", + PersistenceMode: "Enabled", + }, + }, + }, + }, + logger: hclog.NewNullLogger(), + }, + ExpectedWriteToChannel: &device.FingerprintResponse{ + Devices: []*device.DeviceGroup{ + { + Vendor: vendor, + Type: deviceType, + Name: "Name1", + Devices: []*device.Device{ + { + ID: "1", + Healthy: true, + HwLocality: &device.DeviceLocality{ + PciBusID: "pciBusID1", + }, + }, + { + ID: "2", + Healthy: true, + HwLocality: &device.DeviceLocality{ + PciBusID: "pciBusID2", + }, + }, + { + ID: "3", + Healthy: true, + HwLocality: &device.DeviceLocality{ + PciBusID: "pciBusID3", + }, 
+ }, + }, + Attributes: map[string]string{ + MemoryMiBAttr: "10", + DriverVersionAttr: "1", + PowerWAttr: "100", + BAR1MiBAttr: "256", + PCIBandwidthMBPerSAttr: "1", + CoresClockMHzAttr: "1", + MemoryClockMHzAttr: "1", + DisplayStateAttr: "Enabled", + PersistenceModeAttr: "Enabled", + }, + }, + }, + }, + }, + { + Name: "Check that not working driver returns empty fingeprint data", + Device: &NvidiaDevice{ + nvmlClientInitializationError: errors.New(""), + nvmlClient: &MockNvmlClient{ + FingerprintResponseReturned: &nvml.FingerprintData{ + DriverVersion: "1", + Devices: []*nvml.FingerprintDeviceData{ + { + DeviceData: &nvml.DeviceData{ + UUID: "1", + DeviceName: helper.StringToPtr("Name1"), + MemoryMiB: helper.Uint64ToPtr(10), + }, + }, + { + DeviceData: &nvml.DeviceData{ + UUID: "2", + DeviceName: helper.StringToPtr("Name1"), + MemoryMiB: helper.Uint64ToPtr(10), + }, + }, + { + DeviceData: &nvml.DeviceData{ + UUID: "3", + DeviceName: helper.StringToPtr("Name1"), + MemoryMiB: helper.Uint64ToPtr(10), + }, + }, + }, + }, + }, + logger: hclog.NewNullLogger(), + }, + ExpectedWriteToChannel: &device.FingerprintResponse{}, + }, + } { + outCh := make(chan *device.FingerprintResponse) + ctx, cancel := context.WithCancel(context.Background()) + go testCase.Device.fingerprint(ctx, outCh) + result := <-outCh + cancel() + require.New(t).Equal(result, testCase.ExpectedWriteToChannel) + } +} diff --git a/plugins/device/cmd/nvidia/nvml/client.go b/plugins/device/cmd/nvidia/nvml/client.go new file mode 100644 index 000000000..d18dcbe1a --- /dev/null +++ b/plugins/device/cmd/nvidia/nvml/client.go @@ -0,0 +1,194 @@ +package nvml + +import ( + "fmt" +) + +// DeviceData represents common fields for Nvidia device +type DeviceData struct { + UUID string + DeviceName *string + MemoryMiB *uint64 + PowerW *uint + BAR1MiB *uint64 +} + +// FingerprintDeviceData is a superset of DeviceData +// it describes device specific fields returned from +// nvml queries during fingerprinting call +type 
FingerprintDeviceData struct { + *DeviceData + PCIBandwidthMBPerS *uint + CoresClockMHz *uint + MemoryClockMHz *uint + DisplayState string + PersistenceMode string + PCIBusID string +} + +// FingerprintData represets attributes of driver/devices +type FingerprintData struct { + Devices []*FingerprintDeviceData + DriverVersion string +} + +// StatsData is a superset of DeviceData +// it represents statistics data returned for every Nvidia device +type StatsData struct { + *DeviceData + PowerUsageW *uint + GPUUtilization *uint + MemoryUtilization *uint + EncoderUtilization *uint + DecoderUtilization *uint + TemperatureC *uint + UsedMemoryMiB *uint64 + BAR1UsedMiB *uint64 + ECCErrorsL1Cache *uint64 + ECCErrorsL2Cache *uint64 + ECCErrorsDevice *uint64 +} + +// NvmlClient describes how users would use nvml library +type NvmlClient interface { + GetFingerprintData() (*FingerprintData, error) + GetStatsData() ([]*StatsData, error) +} + +// nvmlClient implements NvmlClient +// Users of this lib are expected to use this struct via NewNvmlClient func +type nvmlClient struct { + driver NvmlDriver +} + +// NewNvmlClient function creates new nvmlClient with real +// NvmlDriver implementation. 
Also, this func initializes NvmlDriver +func NewNvmlClient() (*nvmlClient, error) { + driver := &nvmlDriver{} + err := driver.Initialize() + if err != nil { + return nil, err + } + return &nvmlClient{ + driver: driver, + }, nil +} + +// GetFingerprintData returns FingerprintData for available Nvidia devices +func (c *nvmlClient) GetFingerprintData() (*FingerprintData, error) { + /* + nvml fields to be fingerprinted # nvml_library_call + 1 - Driver Version # nvmlSystemGetDriverVersion + 2 - Product Name # nvmlDeviceGetName + 3 - GPU UUID # nvmlDeviceGetUUID + 4 - Total Memory # nvmlDeviceGetMemoryInfo + 5 - Power # nvmlDeviceGetPowerManagementLimit + 6 - PCIBusID # nvmlDeviceGetPciInfo + 7 - BAR1 Memory # nvmlDeviceGetBAR1MemoryInfo( + 8 - PCI Bandwidth + 9 - Memory, Cores Clock # nvmlDeviceGetMaxClockInfo + 10 - Display Mode # nvmlDeviceGetDisplayMode + 11 - Persistence Mode # nvmlDeviceGetPersistenceMode + */ + + // Assumed that this method is called with receiver retrieved from + // NewNvmlClient + // because this method handles initialization of NVML library + + driverVersion, err := c.driver.SystemDriverVersion() + if err != nil { + return nil, fmt.Errorf("nvidia nvml SystemDriverVersion() error: %v\n", err) + } + + numDevices, err := c.driver.DeviceCount() + if err != nil { + return nil, fmt.Errorf("nvidia nvml DeviceCount() error: %v\n", err) + } + + allNvidiaGPUResources := make([]*FingerprintDeviceData, numDevices) + + for i := 0; i < int(numDevices); i++ { + deviceInfo, err := c.driver.DeviceInfoByIndex(uint(i)) + if err != nil { + return nil, fmt.Errorf("nvidia nvml DeviceInfoByIndex() error: %v\n", err) + } + + allNvidiaGPUResources[i] = &FingerprintDeviceData{ + DeviceData: &DeviceData{ + DeviceName: deviceInfo.Name, + UUID: deviceInfo.UUID, + MemoryMiB: deviceInfo.MemoryMiB, + PowerW: deviceInfo.PowerW, + BAR1MiB: deviceInfo.BAR1MiB, + }, + PCIBandwidthMBPerS: deviceInfo.PCIBandwidthMBPerS, + CoresClockMHz: deviceInfo.CoresClockMHz, + MemoryClockMHz: 
deviceInfo.MemoryClockMHz, + DisplayState: deviceInfo.DisplayState, + PersistenceMode: deviceInfo.PersistenceMode, + PCIBusID: deviceInfo.PCIBusID, + } + } + return &FingerprintData{ + Devices: allNvidiaGPUResources, + DriverVersion: driverVersion, + }, nil +} + +// GetStatsData returns statistics data for all devices on this machine +func (c *nvmlClient) GetStatsData() ([]*StatsData, error) { + /* + nvml fields to be reported to stats api # nvml_library_call + 1 - Used Memory # nvmlDeviceGetMemoryInfo + 2 - Utilization of GPU # nvmlDeviceGetUtilizationRates + 3 - Utilization of Memory # nvmlDeviceGetUtilizationRates + 4 - Utilization of Decoder # nvmlDeviceGetDecoderUtilization + 5 - Utilization of Encoder # nvmlDeviceGetEncoderUtilization + 6 - Current GPU Temperature # nvmlDeviceGetTemperature + 7 - Power Draw # nvmlDeviceGetPowerUsage + 8 - BAR1 Used memory # nvmlDeviceGetBAR1MemoryInfo + 9 - ECC Errors on requesting L1Cache # nvmlDeviceGetMemoryErrorCounter + 10 - ECC Errors on requesting L2Cache # nvmlDeviceGetMemoryErrorCounter + 11 - ECC Errors on requesting Device memory # nvmlDeviceGetMemoryErrorCounter + */ + + // Assumed that this method is called with receiver retrieved from + // NewNvmlClient + // because this method handles initialization of NVML library + + numDevices, err := c.driver.DeviceCount() + if err != nil { + return nil, fmt.Errorf("nvidia nvml DeviceCount() error: %v\n", err) + } + + allNvidiaGPUStats := make([]*StatsData, numDevices) + + for i := 0; i < int(numDevices); i++ { + deviceInfo, deviceStatus, err := c.driver.DeviceInfoAndStatusByIndex(uint(i)) + if err != nil { + return nil, fmt.Errorf("nvidia nvml DeviceInfoAndStatusByIndex() error: %v\n", err) + } + + allNvidiaGPUStats[i] = &StatsData{ + DeviceData: &DeviceData{ + DeviceName: deviceInfo.Name, + UUID: deviceInfo.UUID, + MemoryMiB: deviceInfo.MemoryMiB, + PowerW: deviceInfo.PowerW, + BAR1MiB: deviceInfo.BAR1MiB, + }, + PowerUsageW: deviceStatus.PowerUsageW, + GPUUtilization: 
deviceStatus.GPUUtilization, + MemoryUtilization: deviceStatus.MemoryUtilization, + EncoderUtilization: deviceStatus.EncoderUtilization, + DecoderUtilization: deviceStatus.DecoderUtilization, + TemperatureC: deviceStatus.TemperatureC, + UsedMemoryMiB: deviceStatus.UsedMemoryMiB, + BAR1UsedMiB: deviceStatus.BAR1UsedMiB, + ECCErrorsL1Cache: deviceStatus.ECCErrorsL1Cache, + ECCErrorsL2Cache: deviceStatus.ECCErrorsL2Cache, + ECCErrorsDevice: deviceStatus.ECCErrorsDevice, + } + } + return allNvidiaGPUStats, nil +} diff --git a/plugins/device/cmd/nvidia/nvml/client_test.go b/plugins/device/cmd/nvidia/nvml/client_test.go new file mode 100644 index 000000000..23731f7b0 --- /dev/null +++ b/plugins/device/cmd/nvidia/nvml/client_test.go @@ -0,0 +1,399 @@ +package nvml + +import ( + "errors" + "testing" + + "github.com/hashicorp/nomad/helper" + "github.com/stretchr/testify/require" +) + +type MockNVMLDriver struct { + systemDriverCallSuccessful bool + deviceCountCallSuccessful bool + deviceInfoByIndexCallSuccessful bool + deviceInfoAndStatusByIndexCallSuccessful bool + driverVersion string + devices []*DeviceInfo + deviceStatus []*DeviceStatus +} + +func (m *MockNVMLDriver) Initialize() error { + return nil +} + +func (m *MockNVMLDriver) Shutdown() error { + return nil +} + +func (m *MockNVMLDriver) SystemDriverVersion() (string, error) { + if !m.systemDriverCallSuccessful { + return "", errors.New("failed to get system driver") + } + return m.driverVersion, nil +} + +func (m *MockNVMLDriver) DeviceCount() (uint, error) { + if !m.deviceCountCallSuccessful { + return 0, errors.New("failed to get device length") + } + return uint(len(m.devices)), nil +} + +func (m *MockNVMLDriver) DeviceInfoByIndex(index uint) (*DeviceInfo, error) { + if index >= uint(len(m.devices)) { + return nil, errors.New("index is out of range") + } + if !m.deviceInfoByIndexCallSuccessful { + return nil, errors.New("failed to get device info by index") + } + return m.devices[index], nil +} + +func (m 
*MockNVMLDriver) DeviceInfoAndStatusByIndex(index uint) (*DeviceInfo, *DeviceStatus, error) { + if index >= uint(len(m.devices)) || index >= uint(len(m.deviceStatus)) { + return nil, nil, errors.New("index is out of range") + } + if !m.deviceInfoAndStatusByIndexCallSuccessful { + return nil, nil, errors.New("failed to get device info and status by index") + } + return m.devices[index], m.deviceStatus[index], nil +} + +func TestGetFingerprintDataFromNVML(t *testing.T) { + for _, testCase := range []struct { + Name string + DriverConfiguration *MockNVMLDriver + ExpectedError bool + ExpectedResult *FingerprintData + }{ + { + Name: "fail on systemDriverCallSuccessful", + ExpectedError: true, + ExpectedResult: nil, + DriverConfiguration: &MockNVMLDriver{ + systemDriverCallSuccessful: false, + deviceCountCallSuccessful: true, + deviceInfoByIndexCallSuccessful: true, + }, + }, + { + Name: "fail on deviceCountCallSuccessful", + ExpectedError: true, + ExpectedResult: nil, + DriverConfiguration: &MockNVMLDriver{ + systemDriverCallSuccessful: true, + deviceCountCallSuccessful: false, + deviceInfoByIndexCallSuccessful: true, + }, + }, + { + Name: "fail on deviceInfoByIndexCall", + ExpectedError: true, + ExpectedResult: nil, + DriverConfiguration: &MockNVMLDriver{ + systemDriverCallSuccessful: true, + deviceCountCallSuccessful: true, + deviceInfoByIndexCallSuccessful: false, + devices: []*DeviceInfo{ + { + UUID: "UUID1", + Name: helper.StringToPtr("ModelName1"), + MemoryMiB: helper.Uint64ToPtr(16), + PCIBusID: "busId", + PowerW: helper.UintToPtr(100), + BAR1MiB: helper.Uint64ToPtr(100), + PCIBandwidthMBPerS: helper.UintToPtr(100), + CoresClockMHz: helper.UintToPtr(100), + MemoryClockMHz: helper.UintToPtr(100), + }, { + UUID: "UUID2", + Name: helper.StringToPtr("ModelName2"), + MemoryMiB: helper.Uint64ToPtr(8), + PCIBusID: "busId", + PowerW: helper.UintToPtr(100), + BAR1MiB: helper.Uint64ToPtr(100), + PCIBandwidthMBPerS: helper.UintToPtr(100), + CoresClockMHz: 
helper.UintToPtr(100), + MemoryClockMHz: helper.UintToPtr(100), + }, + }, + }, + }, + { + Name: "successful outcome", + ExpectedError: false, + ExpectedResult: &FingerprintData{ + DriverVersion: "driverVersion", + Devices: []*FingerprintDeviceData{ + { + DeviceData: &DeviceData{ + DeviceName: helper.StringToPtr("ModelName1"), + UUID: "UUID1", + MemoryMiB: helper.Uint64ToPtr(16), + PowerW: helper.UintToPtr(100), + BAR1MiB: helper.Uint64ToPtr(100), + }, + PCIBusID: "busId1", + PCIBandwidthMBPerS: helper.UintToPtr(100), + CoresClockMHz: helper.UintToPtr(100), + MemoryClockMHz: helper.UintToPtr(100), + DisplayState: "Enabled", + PersistenceMode: "Enabled", + }, { + DeviceData: &DeviceData{ + DeviceName: helper.StringToPtr("ModelName2"), + UUID: "UUID2", + MemoryMiB: helper.Uint64ToPtr(8), + PowerW: helper.UintToPtr(200), + BAR1MiB: helper.Uint64ToPtr(200), + }, + PCIBusID: "busId2", + PCIBandwidthMBPerS: helper.UintToPtr(200), + CoresClockMHz: helper.UintToPtr(200), + MemoryClockMHz: helper.UintToPtr(200), + DisplayState: "Enabled", + PersistenceMode: "Enabled", + }, + }, + }, + DriverConfiguration: &MockNVMLDriver{ + systemDriverCallSuccessful: true, + deviceCountCallSuccessful: true, + deviceInfoByIndexCallSuccessful: true, + driverVersion: "driverVersion", + devices: []*DeviceInfo{ + { + UUID: "UUID1", + Name: helper.StringToPtr("ModelName1"), + MemoryMiB: helper.Uint64ToPtr(16), + PCIBusID: "busId1", + PowerW: helper.UintToPtr(100), + BAR1MiB: helper.Uint64ToPtr(100), + PCIBandwidthMBPerS: helper.UintToPtr(100), + CoresClockMHz: helper.UintToPtr(100), + MemoryClockMHz: helper.UintToPtr(100), + DisplayState: "Enabled", + PersistenceMode: "Enabled", + }, { + UUID: "UUID2", + Name: helper.StringToPtr("ModelName2"), + MemoryMiB: helper.Uint64ToPtr(8), + PCIBusID: "busId2", + PowerW: helper.UintToPtr(200), + BAR1MiB: helper.Uint64ToPtr(200), + PCIBandwidthMBPerS: helper.UintToPtr(200), + CoresClockMHz: helper.UintToPtr(200), + MemoryClockMHz: helper.UintToPtr(200), + 
DisplayState: "Enabled", + PersistenceMode: "Enabled", + }, + }, + }, + }, + } { + cli := nvmlClient{driver: testCase.DriverConfiguration} + fingerprintData, err := cli.GetFingerprintData() + if testCase.ExpectedError && err == nil { + t.Errorf("case '%s' : expected Error, but didn't get one", testCase.Name) + } + if !testCase.ExpectedError && err != nil { + t.Errorf("case '%s' : unexpected Error '%v'", testCase.Name, err) + } + require.New(t).Equal(testCase.ExpectedResult, fingerprintData) + } +} + +func TestGetStatsDataFromNVML(t *testing.T) { + for _, testCase := range []struct { + Name string + DriverConfiguration *MockNVMLDriver + ExpectedError bool + ExpectedResult []*StatsData + }{ + { + Name: "fail on deviceCountCallSuccessful", + ExpectedError: true, + ExpectedResult: nil, + DriverConfiguration: &MockNVMLDriver{ + systemDriverCallSuccessful: true, + deviceCountCallSuccessful: false, + deviceInfoByIndexCallSuccessful: true, + deviceInfoAndStatusByIndexCallSuccessful: true, + }, + }, + { + Name: "fail on DeviceInfoAndStatusByIndex call", + ExpectedError: true, + ExpectedResult: nil, + DriverConfiguration: &MockNVMLDriver{ + systemDriverCallSuccessful: true, + deviceCountCallSuccessful: true, + deviceInfoAndStatusByIndexCallSuccessful: false, + devices: []*DeviceInfo{ + { + UUID: "UUID1", + Name: helper.StringToPtr("ModelName1"), + MemoryMiB: helper.Uint64ToPtr(16), + PCIBusID: "busId1", + PowerW: helper.UintToPtr(100), + BAR1MiB: helper.Uint64ToPtr(100), + PCIBandwidthMBPerS: helper.UintToPtr(100), + CoresClockMHz: helper.UintToPtr(100), + MemoryClockMHz: helper.UintToPtr(100), + }, { + UUID: "UUID2", + Name: helper.StringToPtr("ModelName2"), + MemoryMiB: helper.Uint64ToPtr(8), + PCIBusID: "busId2", + PowerW: helper.UintToPtr(200), + BAR1MiB: helper.Uint64ToPtr(200), + PCIBandwidthMBPerS: helper.UintToPtr(200), + CoresClockMHz: helper.UintToPtr(200), + MemoryClockMHz: helper.UintToPtr(200), + }, + }, + deviceStatus: []*DeviceStatus{ + { + TemperatureC: 
helper.UintToPtr(1), + GPUUtilization: helper.UintToPtr(1), + MemoryUtilization: helper.UintToPtr(1), + EncoderUtilization: helper.UintToPtr(1), + DecoderUtilization: helper.UintToPtr(1), + UsedMemoryMiB: helper.Uint64ToPtr(1), + ECCErrorsL1Cache: helper.Uint64ToPtr(1), + ECCErrorsL2Cache: helper.Uint64ToPtr(1), + ECCErrorsDevice: helper.Uint64ToPtr(1), + PowerUsageW: helper.UintToPtr(1), + BAR1UsedMiB: helper.Uint64ToPtr(1), + }, + { + TemperatureC: helper.UintToPtr(2), + GPUUtilization: helper.UintToPtr(2), + MemoryUtilization: helper.UintToPtr(2), + EncoderUtilization: helper.UintToPtr(2), + DecoderUtilization: helper.UintToPtr(2), + UsedMemoryMiB: helper.Uint64ToPtr(2), + ECCErrorsL1Cache: helper.Uint64ToPtr(2), + ECCErrorsL2Cache: helper.Uint64ToPtr(2), + ECCErrorsDevice: helper.Uint64ToPtr(2), + PowerUsageW: helper.UintToPtr(2), + BAR1UsedMiB: helper.Uint64ToPtr(2), + }, + }, + }, + }, + { + Name: "successful outcome", + ExpectedError: false, + ExpectedResult: []*StatsData{ + { + DeviceData: &DeviceData{ + DeviceName: helper.StringToPtr("ModelName1"), + UUID: "UUID1", + MemoryMiB: helper.Uint64ToPtr(16), + PowerW: helper.UintToPtr(100), + BAR1MiB: helper.Uint64ToPtr(100), + }, + TemperatureC: helper.UintToPtr(1), + GPUUtilization: helper.UintToPtr(1), + MemoryUtilization: helper.UintToPtr(1), + EncoderUtilization: helper.UintToPtr(1), + DecoderUtilization: helper.UintToPtr(1), + UsedMemoryMiB: helper.Uint64ToPtr(1), + ECCErrorsL1Cache: helper.Uint64ToPtr(1), + ECCErrorsL2Cache: helper.Uint64ToPtr(1), + ECCErrorsDevice: helper.Uint64ToPtr(1), + PowerUsageW: helper.UintToPtr(1), + BAR1UsedMiB: helper.Uint64ToPtr(1), + }, + { + DeviceData: &DeviceData{ + DeviceName: helper.StringToPtr("ModelName2"), + UUID: "UUID2", + MemoryMiB: helper.Uint64ToPtr(8), + PowerW: helper.UintToPtr(200), + BAR1MiB: helper.Uint64ToPtr(200), + }, + TemperatureC: helper.UintToPtr(2), + GPUUtilization: helper.UintToPtr(2), + MemoryUtilization: helper.UintToPtr(2), + EncoderUtilization: 
helper.UintToPtr(2), + DecoderUtilization: helper.UintToPtr(2), + UsedMemoryMiB: helper.Uint64ToPtr(2), + ECCErrorsL1Cache: helper.Uint64ToPtr(2), + ECCErrorsL2Cache: helper.Uint64ToPtr(2), + ECCErrorsDevice: helper.Uint64ToPtr(2), + PowerUsageW: helper.UintToPtr(2), + BAR1UsedMiB: helper.Uint64ToPtr(2), + }, + }, + DriverConfiguration: &MockNVMLDriver{ + deviceCountCallSuccessful: true, + deviceInfoByIndexCallSuccessful: true, + deviceInfoAndStatusByIndexCallSuccessful: true, + devices: []*DeviceInfo{ + { + UUID: "UUID1", + Name: helper.StringToPtr("ModelName1"), + MemoryMiB: helper.Uint64ToPtr(16), + PCIBusID: "busId1", + PowerW: helper.UintToPtr(100), + BAR1MiB: helper.Uint64ToPtr(100), + PCIBandwidthMBPerS: helper.UintToPtr(100), + CoresClockMHz: helper.UintToPtr(100), + MemoryClockMHz: helper.UintToPtr(100), + }, { + UUID: "UUID2", + Name: helper.StringToPtr("ModelName2"), + MemoryMiB: helper.Uint64ToPtr(8), + PCIBusID: "busId2", + PowerW: helper.UintToPtr(200), + BAR1MiB: helper.Uint64ToPtr(200), + PCIBandwidthMBPerS: helper.UintToPtr(200), + CoresClockMHz: helper.UintToPtr(200), + MemoryClockMHz: helper.UintToPtr(200), + }, + }, + deviceStatus: []*DeviceStatus{ + { + TemperatureC: helper.UintToPtr(1), + GPUUtilization: helper.UintToPtr(1), + MemoryUtilization: helper.UintToPtr(1), + EncoderUtilization: helper.UintToPtr(1), + DecoderUtilization: helper.UintToPtr(1), + UsedMemoryMiB: helper.Uint64ToPtr(1), + ECCErrorsL1Cache: helper.Uint64ToPtr(1), + ECCErrorsL2Cache: helper.Uint64ToPtr(1), + ECCErrorsDevice: helper.Uint64ToPtr(1), + PowerUsageW: helper.UintToPtr(1), + BAR1UsedMiB: helper.Uint64ToPtr(1), + }, + { + TemperatureC: helper.UintToPtr(2), + GPUUtilization: helper.UintToPtr(2), + MemoryUtilization: helper.UintToPtr(2), + EncoderUtilization: helper.UintToPtr(2), + DecoderUtilization: helper.UintToPtr(2), + UsedMemoryMiB: helper.Uint64ToPtr(2), + ECCErrorsL1Cache: helper.Uint64ToPtr(2), + ECCErrorsL2Cache: helper.Uint64ToPtr(2), + ECCErrorsDevice: 
helper.Uint64ToPtr(2), + PowerUsageW: helper.UintToPtr(2), + BAR1UsedMiB: helper.Uint64ToPtr(2), + }, + }, + }, + }, + } { + cli := nvmlClient{driver: testCase.DriverConfiguration} + statsData, err := cli.GetStatsData() + if testCase.ExpectedError && err == nil { + t.Errorf("case '%s' : expected Error, but didn't get one", testCase.Name) + } + if !testCase.ExpectedError && err != nil { + t.Errorf("case '%s' : unexpected Error '%v'", testCase.Name, err) + } + require.New(t).Equal(testCase.ExpectedResult, statsData) + } +} diff --git a/plugins/device/cmd/nvidia/nvml/driver.go b/plugins/device/cmd/nvidia/nvml/driver.go new file mode 100644 index 000000000..ef1ba57c4 --- /dev/null +++ b/plugins/device/cmd/nvidia/nvml/driver.go @@ -0,0 +1,138 @@ +package nvml + +import ( + "github.com/NVIDIA/gpu-monitoring-tools/bindings/go/nvml" +) + +// DeviceInfo represents nvml device data +// this struct is returned by NvmlDriver DeviceInfoByIndex and +// DeviceInfoAndStatusByIndex methods +type DeviceInfo struct { + // The following fields are guaranteed to be retrieved from nvml + UUID string + PCIBusID string + DisplayState string + PersistenceMode string + + // The following fields can be nil after call to nvml, because nvml was + // not able to retrieve this fields for specific nvidia card + Name *string + MemoryMiB *uint64 + PowerW *uint + BAR1MiB *uint64 + PCIBandwidthMBPerS *uint + CoresClockMHz *uint + MemoryClockMHz *uint +} + +// DeviceStatus represents nvml device status +// this struct is returned by NvmlDriver DeviceInfoAndStatusByIndex method +type DeviceStatus struct { + // The following fields can be nil after call to nvml, because nvml was + // not able to retrieve this fields for specific nvidia card + PowerUsageW *uint + TemperatureC *uint + GPUUtilization *uint // % + MemoryUtilization *uint // % + EncoderUtilization *uint // % + DecoderUtilization *uint // % + BAR1UsedMiB *uint64 + UsedMemoryMiB *uint64 + ECCErrorsL1Cache *uint64 + ECCErrorsL2Cache *uint64 + 
ECCErrorsDevice *uint64 +} + +// NvmlDriver represents set of methods to query nvml library +type NvmlDriver interface { + Initialize() error + Shutdown() error + SystemDriverVersion() (string, error) + DeviceCount() (uint, error) + DeviceInfoByIndex(uint) (*DeviceInfo, error) + DeviceInfoAndStatusByIndex(uint) (*DeviceInfo, *DeviceStatus, error) +} + +// nvmlDriver implements NvmlDriver +// Users are required to call Initialize method before using any other methods +type nvmlDriver struct{} + +// Initialize nvml library by locating nvml shared object file and calling dlopen +func (n *nvmlDriver) Initialize() error { + return nvml.Init() +} + +// Shutdown stops any further interaction with nvml +func (n *nvmlDriver) Shutdown() error { + return nvml.Shutdown() +} + +// SystemDriverVersion returns installed driver version +func (n *nvmlDriver) SystemDriverVersion() (string, error) { + return nvml.GetDriverVersion() +} + +// DeviceCount reports number of available GPU devices +func (n *nvmlDriver) DeviceCount() (uint, error) { + return nvml.GetDeviceCount() +} + +// DeviceInfoByIndex returns DeviceInfo for index GPU in system device list +func (n *nvmlDriver) DeviceInfoByIndex(index uint) (*DeviceInfo, error) { + device, err := nvml.NewDevice(index) + if err != nil { + return nil, err + } + deviceMode, err := device.GetDeviceMode() + if err != nil { + return nil, err + } + return &DeviceInfo{ + UUID: device.UUID, + Name: device.Model, + MemoryMiB: device.Memory, + PowerW: device.Power, + BAR1MiB: device.PCI.BAR1, + PCIBandwidthMBPerS: device.PCI.Bandwidth, + PCIBusID: device.PCI.BusID, + CoresClockMHz: device.Clocks.Cores, + MemoryClockMHz: device.Clocks.Memory, + DisplayState: deviceMode.DisplayInfo.Mode.String(), + PersistenceMode: deviceMode.Persistence.String(), + }, nil +} + +// DeviceInfoAndStatusByIndex returns DeviceInfo and DeviceStatus for index GPU in system device list +func (n *nvmlDriver) DeviceInfoAndStatusByIndex(index uint) (*DeviceInfo, *DeviceStatus, error) 
{ + device, err := nvml.NewDevice(index) + if err != nil { + return nil, nil, err + } + status, err := device.Status() + if err != nil { + return nil, nil, err + } + return &DeviceInfo{ + UUID: device.UUID, + Name: device.Model, + MemoryMiB: device.Memory, + PowerW: device.Power, + BAR1MiB: device.PCI.BAR1, + PCIBandwidthMBPerS: device.PCI.Bandwidth, + PCIBusID: device.PCI.BusID, + CoresClockMHz: device.Clocks.Cores, + MemoryClockMHz: device.Clocks.Memory, + }, &DeviceStatus{ + TemperatureC: status.Temperature, + GPUUtilization: status.Utilization.GPU, + MemoryUtilization: status.Utilization.Memory, + EncoderUtilization: status.Utilization.Encoder, + DecoderUtilization: status.Utilization.Decoder, + UsedMemoryMiB: status.Memory.Global.Used, + ECCErrorsL1Cache: status.Memory.ECCErrors.L1Cache, + ECCErrorsL2Cache: status.Memory.ECCErrors.L2Cache, + ECCErrorsDevice: status.Memory.ECCErrors.Device, + PowerUsageW: status.Power, + BAR1UsedMiB: status.PCI.BAR1Used, + }, nil +} diff --git a/plugins/device/cmd/nvidia/stats.go b/plugins/device/cmd/nvidia/stats.go new file mode 100644 index 000000000..022c710fc --- /dev/null +++ b/plugins/device/cmd/nvidia/stats.go @@ -0,0 +1,301 @@ +package nvidia + +import ( + "context" + "time" + + "github.com/hashicorp/nomad/plugins/device" + "github.com/hashicorp/nomad/plugins/device/cmd/nvidia/nvml" +) + +const ( + // Attribute names for reporting stats output + PowerUsageAttr = "Power usage" + PowerUsageUnit = "W" + PowerUsageDesc = "Power usage for this GPU in watts and " + + "its associated circuitry (e.g. memory) / Maximum GPU Power" + GPUUtilizationAttr = "GPU utilization" + GPUUtilizationUnit = "%" + GPUUtilizationDesc = "Percent of time over the past sample period " + + "during which one or more kernels were executing on the GPU." 
+ MemoryUtilizationAttr = "Memory utilization" + MemoryUtilizationUnit = "%" + MemoryUtilizationDesc = "Percentage of bandwidth used during the past sample period" + EncoderUtilizationAttr = "Encoder utilization" + EncoderUtilizationUnit = "%" + EncoderUtilizationDesc = "Percent of time over the past sample period " + + "during which GPU Encoder was used" + DecoderUtilizationAttr = "Decoder utilization" + DecoderUtilizationUnit = "%" + DecoderUtilizationDesc = "Percent of time over the past sample period " + + "during which GPU Decoder was used" + TemperatureAttr = "Temperature" + TemperatureUnit = "C" // Celsius degrees + TemperatureDesc = "Temperature of the Unit" + MemoryStateAttr = "Memory state" + MemoryStateUnit = "MiB" // Mebibytes + MemoryStateDesc = "UsedMemory / TotalMemory" + BAR1StateAttr = "BAR1 buffer state" + BAR1StateUnit = "MiB" // Mebibytes + BAR1StateDesc = "UsedBAR1 / TotalBAR1" + ECCErrorsL1CacheAttr = "ECC L1 errors" + ECCErrorsL1CacheUnit = "#" // number of errors + ECCErrorsL1CacheDesc = "Requested L1Cache error counter for the device" + ECCErrorsL2CacheAttr = "ECC L2 errors" + ECCErrorsL2CacheUnit = "#" // number of errors + ECCErrorsL2CacheDesc = "Requested L2Cache error counter for the device" + ECCErrorsDeviceAttr = "ECC memory errors" + ECCErrorsDeviceUnit = "#" // number of errors + ECCErrorsDeviceDesc = "Requested memory error counter for the device" +) + +// stats is the long running goroutine that streams device statistics +func (d *NvidiaDevice) stats(ctx context.Context, stats chan<- *device.StatsResponse) { + defer close(stats) + + if d.nvmlClientInitializationError != nil { + d.logger.Error("exiting stats due to problems with NVML loading", "error", d.nvmlClientInitializationError) + return + } + + // Create a timer that will fire immediately for the first detection + ticker := time.NewTimer(0) + + for { + select { + case <-ctx.Done(): + return + case <-ticker.C: + ticker.Reset(d.statsPeriod) + } + + d.writeStatsToChannel(stats, 
time.Now()) + } +} + +// filterStatsByID accepts list of StatsData and set of IDs +// this function would return entries from StatsData with IDs found in the set +func filterStatsByID(stats []*nvml.StatsData, IDs map[string]struct{}) []*nvml.StatsData { + var filteredStats []*nvml.StatsData + for _, statsItem := range stats { + if _, ok := IDs[statsItem.UUID]; ok { + filteredStats = append(filteredStats, statsItem) + } + } + return filteredStats +} + +// writeStatsToChannel collects StatsData from NVML backend, groups StatsData +// by DeviceName attribute, populates DeviceGroupStats structure for every group +// and sends data over provided channel +func (d *NvidiaDevice) writeStatsToChannel(stats chan<- *device.StatsResponse, timestamp time.Time) { + statsData, err := d.nvmlClient.GetStatsData() + if err != nil { + d.logger.Error("failed to get nvidia stats", "error", err) + stats <- &device.StatsResponse{ + Error: err, + } + return + } + + // filter only stats from devices that are stored in NvidiaDevice struct + d.deviceLock.RLock() + statsData = filterStatsByID(statsData, d.devices) + d.deviceLock.RUnlock() + + // group stats by DeviceName struct field + statsListByDeviceName := make(map[string][]*nvml.StatsData) + for _, statsItem := range statsData { + deviceName := statsItem.DeviceName + if deviceName == nil { + // nvml driver was not able to detect device name. 
This kind + // of devices are placed to single group with 'notAvailable' name + notAvailableCopy := notAvailable + deviceName = &notAvailableCopy + } + + statsListByDeviceName[*deviceName] = append(statsListByDeviceName[*deviceName], statsItem) + } + + // place data device.DeviceGroupStats struct for every group of stats + deviceGroupsStats := make([]*device.DeviceGroupStats, 0, len(statsListByDeviceName)) + for groupName, groupStats := range statsListByDeviceName { + deviceGroupsStats = append(deviceGroupsStats, statsForGroup(groupName, groupStats, timestamp)) + } + + stats <- &device.StatsResponse{ + Groups: deviceGroupsStats, + } +} + +func newNotAvailableDeviceStats(unit, desc string) *device.StatValue { + return &device.StatValue{Unit: unit, Desc: desc, StringVal: notAvailable} +} + +// statsForGroup is a helper function that populates device.DeviceGroupStats +// for given groupName with groupStats list +func statsForGroup(groupName string, groupStats []*nvml.StatsData, timestamp time.Time) *device.DeviceGroupStats { + instanceStats := make(map[string]*device.DeviceStats) + for _, statsItem := range groupStats { + instanceStats[statsItem.UUID] = statsForItem(statsItem, timestamp) + } + + return &device.DeviceGroupStats{ + Vendor: vendor, + Type: deviceType, + Name: groupName, + InstanceStats: instanceStats, + } +} + +// statsForItem is a helper function that populates device.DeviceStats for given +// nvml.StatsData +func statsForItem(statsItem *nvml.StatsData, timestamp time.Time) *device.DeviceStats { + // nvml.StatsData holds pointers to values that can be nil + // In case they are nil return stats with 'notAvailable' constant + var ( + powerUsageStat *device.StatValue + GPUUtilizationStat *device.StatValue + memoryUtilizationStat *device.StatValue + encoderUtilizationStat *device.StatValue + decoderUtilizationStat *device.StatValue + temperatureStat *device.StatValue + memoryStateStat *device.StatValue + BAR1StateStat *device.StatValue + ECCErrorsL1CacheStat 
*device.StatValue + ECCErrorsL2CacheStat *device.StatValue + ECCErrorsDeviceStat *device.StatValue + ) + + if statsItem.PowerUsageW == nil || statsItem.PowerW == nil { + powerUsageStat = newNotAvailableDeviceStats(PowerUsageUnit, PowerUsageDesc) + } else { + powerUsageStat = &device.StatValue{ + Unit: PowerUsageUnit, + Desc: PowerUsageDesc, + IntNumeratorVal: int64(*statsItem.PowerUsageW), + IntDenominatorVal: int64(*statsItem.PowerW), + } + } + + if statsItem.GPUUtilization == nil { + GPUUtilizationStat = newNotAvailableDeviceStats(GPUUtilizationUnit, GPUUtilizationDesc) + } else { + GPUUtilizationStat = &device.StatValue{ + Unit: GPUUtilizationUnit, + Desc: GPUUtilizationDesc, + IntNumeratorVal: int64(*statsItem.GPUUtilization), + } + } + + if statsItem.MemoryUtilization == nil { + memoryUtilizationStat = newNotAvailableDeviceStats(MemoryUtilizationUnit, MemoryUtilizationDesc) + } else { + memoryUtilizationStat = &device.StatValue{ + Unit: MemoryUtilizationUnit, + Desc: MemoryUtilizationDesc, + IntNumeratorVal: int64(*statsItem.MemoryUtilization), + } + } + + if statsItem.EncoderUtilization == nil { + encoderUtilizationStat = newNotAvailableDeviceStats(EncoderUtilizationUnit, EncoderUtilizationDesc) + } else { + encoderUtilizationStat = &device.StatValue{ + Unit: EncoderUtilizationUnit, + Desc: EncoderUtilizationDesc, + IntNumeratorVal: int64(*statsItem.EncoderUtilization), + } + } + + if statsItem.DecoderUtilization == nil { + decoderUtilizationStat = newNotAvailableDeviceStats(DecoderUtilizationUnit, DecoderUtilizationDesc) + } else { + decoderUtilizationStat = &device.StatValue{ + Unit: DecoderUtilizationUnit, + Desc: DecoderUtilizationDesc, + IntNumeratorVal: int64(*statsItem.DecoderUtilization), + } + } + + if statsItem.TemperatureC == nil { + temperatureStat = newNotAvailableDeviceStats(TemperatureUnit, TemperatureDesc) + } else { + temperatureStat = &device.StatValue{ + Unit: TemperatureUnit, + Desc: TemperatureDesc, + IntNumeratorVal: 
int64(*statsItem.TemperatureC), + } + } + + if statsItem.UsedMemoryMiB == nil || statsItem.MemoryMiB == nil { + memoryStateStat = newNotAvailableDeviceStats(MemoryStateUnit, MemoryStateDesc) + } else { + memoryStateStat = &device.StatValue{ + Unit: MemoryStateUnit, + Desc: MemoryStateDesc, + IntNumeratorVal: int64(*statsItem.UsedMemoryMiB), + IntDenominatorVal: int64(*statsItem.MemoryMiB), + } + } + + if statsItem.BAR1UsedMiB == nil || statsItem.BAR1MiB == nil { + BAR1StateStat = newNotAvailableDeviceStats(BAR1StateUnit, BAR1StateDesc) + } else { + BAR1StateStat = &device.StatValue{ + Unit: BAR1StateUnit, + Desc: BAR1StateDesc, + IntNumeratorVal: int64(*statsItem.BAR1UsedMiB), + IntDenominatorVal: int64(*statsItem.BAR1MiB), + } + } + + if statsItem.ECCErrorsL1Cache == nil { + ECCErrorsL1CacheStat = newNotAvailableDeviceStats(ECCErrorsL1CacheUnit, ECCErrorsL1CacheDesc) + } else { + ECCErrorsL1CacheStat = &device.StatValue{ + Unit: ECCErrorsL1CacheUnit, + Desc: ECCErrorsL1CacheDesc, + IntNumeratorVal: int64(*statsItem.ECCErrorsL1Cache), + } + } + + if statsItem.ECCErrorsL2Cache == nil { + ECCErrorsL2CacheStat = newNotAvailableDeviceStats(ECCErrorsL2CacheUnit, ECCErrorsL2CacheDesc) + } else { + ECCErrorsL2CacheStat = &device.StatValue{ + Unit: ECCErrorsL2CacheUnit, + Desc: ECCErrorsL2CacheDesc, + IntNumeratorVal: int64(*statsItem.ECCErrorsL2Cache), + } + } + + if statsItem.ECCErrorsDevice == nil { + ECCErrorsDeviceStat = newNotAvailableDeviceStats(ECCErrorsDeviceUnit, ECCErrorsDeviceDesc) + } else { + ECCErrorsDeviceStat = &device.StatValue{ + Unit: ECCErrorsDeviceUnit, + Desc: ECCErrorsDeviceDesc, + IntNumeratorVal: int64(*statsItem.ECCErrorsDevice), + } + } + return &device.DeviceStats{ + Summary: temperatureStat, + Stats: &device.StatObject{ + Attributes: map[string]*device.StatValue{ + PowerUsageAttr: powerUsageStat, + GPUUtilizationAttr: GPUUtilizationStat, + MemoryUtilizationAttr: memoryUtilizationStat, + EncoderUtilizationAttr: encoderUtilizationStat, + 
DecoderUtilizationAttr: decoderUtilizationStat, + TemperatureAttr: temperatureStat, + MemoryStateAttr: memoryStateStat, + BAR1StateAttr: BAR1StateStat, + ECCErrorsL1CacheAttr: ECCErrorsL1CacheStat, + ECCErrorsL2CacheAttr: ECCErrorsL2CacheStat, + ECCErrorsDeviceAttr: ECCErrorsDeviceStat, + }, + }, + Timestamp: timestamp, + } +} diff --git a/plugins/device/cmd/nvidia/stats_test.go b/plugins/device/cmd/nvidia/stats_test.go new file mode 100644 index 000000000..d60eb88da --- /dev/null +++ b/plugins/device/cmd/nvidia/stats_test.go @@ -0,0 +1,3016 @@ +package nvidia + +import ( + "errors" + "sort" + "testing" + "time" + + hclog "github.com/hashicorp/go-hclog" + "github.com/hashicorp/nomad/helper" + "github.com/hashicorp/nomad/plugins/device" + "github.com/hashicorp/nomad/plugins/device/cmd/nvidia/nvml" + "github.com/stretchr/testify/require" +) + +func TestFilterStatsByID(t *testing.T) { + for _, testCase := range []struct { + Name string + ProvidedStats []*nvml.StatsData + ProvidedIDs map[string]struct{} + ExpectedResult []*nvml.StatsData + }{ + { + Name: "All ids are in the map", + ProvidedStats: []*nvml.StatsData{ + { + DeviceData: &nvml.DeviceData{ + UUID: "UUID1", + DeviceName: helper.StringToPtr("DeviceName1"), + MemoryMiB: helper.Uint64ToPtr(1), + PowerW: helper.UintToPtr(2), + BAR1MiB: helper.Uint64ToPtr(256), + }, + PowerUsageW: helper.UintToPtr(1), + GPUUtilization: helper.UintToPtr(1), + MemoryUtilization: helper.UintToPtr(1), + EncoderUtilization: helper.UintToPtr(1), + DecoderUtilization: helper.UintToPtr(1), + TemperatureC: helper.UintToPtr(1), + UsedMemoryMiB: helper.Uint64ToPtr(1), + ECCErrorsL1Cache: helper.Uint64ToPtr(100), + ECCErrorsL2Cache: helper.Uint64ToPtr(100), + ECCErrorsDevice: helper.Uint64ToPtr(100), + }, + { + DeviceData: &nvml.DeviceData{ + UUID: "UUID2", + DeviceName: helper.StringToPtr("DeviceName1"), + MemoryMiB: helper.Uint64ToPtr(1), + PowerW: helper.UintToPtr(2), + BAR1MiB: helper.Uint64ToPtr(256), + }, + PowerUsageW: 
helper.UintToPtr(1), + GPUUtilization: helper.UintToPtr(1), + MemoryUtilization: helper.UintToPtr(1), + EncoderUtilization: helper.UintToPtr(1), + DecoderUtilization: helper.UintToPtr(1), + TemperatureC: helper.UintToPtr(1), + UsedMemoryMiB: helper.Uint64ToPtr(1), + ECCErrorsL1Cache: helper.Uint64ToPtr(100), + ECCErrorsL2Cache: helper.Uint64ToPtr(100), + ECCErrorsDevice: helper.Uint64ToPtr(100), + }, + { + DeviceData: &nvml.DeviceData{ + UUID: "UUID3", + DeviceName: helper.StringToPtr("DeviceName1"), + MemoryMiB: helper.Uint64ToPtr(1), + PowerW: helper.UintToPtr(2), + BAR1MiB: helper.Uint64ToPtr(256), + }, + PowerUsageW: helper.UintToPtr(1), + GPUUtilization: helper.UintToPtr(1), + MemoryUtilization: helper.UintToPtr(1), + EncoderUtilization: helper.UintToPtr(1), + DecoderUtilization: helper.UintToPtr(1), + TemperatureC: helper.UintToPtr(1), + UsedMemoryMiB: helper.Uint64ToPtr(1), + ECCErrorsL1Cache: helper.Uint64ToPtr(100), + ECCErrorsL2Cache: helper.Uint64ToPtr(100), + ECCErrorsDevice: helper.Uint64ToPtr(100), + }, + }, + ProvidedIDs: map[string]struct{}{ + "UUID1": {}, + "UUID2": {}, + "UUID3": {}, + }, + ExpectedResult: []*nvml.StatsData{ + { + DeviceData: &nvml.DeviceData{ + UUID: "UUID1", + DeviceName: helper.StringToPtr("DeviceName1"), + MemoryMiB: helper.Uint64ToPtr(1), + PowerW: helper.UintToPtr(2), + BAR1MiB: helper.Uint64ToPtr(256), + }, + PowerUsageW: helper.UintToPtr(1), + GPUUtilization: helper.UintToPtr(1), + MemoryUtilization: helper.UintToPtr(1), + EncoderUtilization: helper.UintToPtr(1), + DecoderUtilization: helper.UintToPtr(1), + TemperatureC: helper.UintToPtr(1), + UsedMemoryMiB: helper.Uint64ToPtr(1), + ECCErrorsL1Cache: helper.Uint64ToPtr(100), + ECCErrorsL2Cache: helper.Uint64ToPtr(100), + ECCErrorsDevice: helper.Uint64ToPtr(100), + }, + { + DeviceData: &nvml.DeviceData{ + UUID: "UUID2", + DeviceName: helper.StringToPtr("DeviceName1"), + MemoryMiB: helper.Uint64ToPtr(1), + PowerW: helper.UintToPtr(2), + BAR1MiB: helper.Uint64ToPtr(256), + }, 
+ PowerUsageW: helper.UintToPtr(1), + GPUUtilization: helper.UintToPtr(1), + MemoryUtilization: helper.UintToPtr(1), + EncoderUtilization: helper.UintToPtr(1), + DecoderUtilization: helper.UintToPtr(1), + TemperatureC: helper.UintToPtr(1), + UsedMemoryMiB: helper.Uint64ToPtr(1), + ECCErrorsL1Cache: helper.Uint64ToPtr(100), + ECCErrorsL2Cache: helper.Uint64ToPtr(100), + ECCErrorsDevice: helper.Uint64ToPtr(100), + }, + { + DeviceData: &nvml.DeviceData{ + UUID: "UUID3", + DeviceName: helper.StringToPtr("DeviceName1"), + MemoryMiB: helper.Uint64ToPtr(1), + PowerW: helper.UintToPtr(2), + BAR1MiB: helper.Uint64ToPtr(256), + }, + PowerUsageW: helper.UintToPtr(1), + GPUUtilization: helper.UintToPtr(1), + MemoryUtilization: helper.UintToPtr(1), + EncoderUtilization: helper.UintToPtr(1), + DecoderUtilization: helper.UintToPtr(1), + TemperatureC: helper.UintToPtr(1), + UsedMemoryMiB: helper.Uint64ToPtr(1), + ECCErrorsL1Cache: helper.Uint64ToPtr(100), + ECCErrorsL2Cache: helper.Uint64ToPtr(100), + ECCErrorsDevice: helper.Uint64ToPtr(100), + }, + }, + }, + { + Name: "Odd are not provided in the map", + ProvidedStats: []*nvml.StatsData{ + { + DeviceData: &nvml.DeviceData{ + UUID: "UUID1", + DeviceName: helper.StringToPtr("DeviceName1"), + MemoryMiB: helper.Uint64ToPtr(1), + PowerW: helper.UintToPtr(2), + BAR1MiB: helper.Uint64ToPtr(256), + }, + PowerUsageW: helper.UintToPtr(1), + GPUUtilization: helper.UintToPtr(1), + MemoryUtilization: helper.UintToPtr(1), + EncoderUtilization: helper.UintToPtr(1), + DecoderUtilization: helper.UintToPtr(1), + TemperatureC: helper.UintToPtr(1), + UsedMemoryMiB: helper.Uint64ToPtr(1), + ECCErrorsL1Cache: helper.Uint64ToPtr(100), + ECCErrorsL2Cache: helper.Uint64ToPtr(100), + ECCErrorsDevice: helper.Uint64ToPtr(100), + }, + { + DeviceData: &nvml.DeviceData{ + UUID: "UUID2", + DeviceName: helper.StringToPtr("DeviceName1"), + MemoryMiB: helper.Uint64ToPtr(1), + PowerW: helper.UintToPtr(2), + BAR1MiB: helper.Uint64ToPtr(256), + }, + PowerUsageW: 
helper.UintToPtr(1), + GPUUtilization: helper.UintToPtr(1), + MemoryUtilization: helper.UintToPtr(1), + EncoderUtilization: helper.UintToPtr(1), + DecoderUtilization: helper.UintToPtr(1), + TemperatureC: helper.UintToPtr(1), + UsedMemoryMiB: helper.Uint64ToPtr(1), + ECCErrorsL1Cache: helper.Uint64ToPtr(100), + ECCErrorsL2Cache: helper.Uint64ToPtr(100), + ECCErrorsDevice: helper.Uint64ToPtr(100), + }, + { + DeviceData: &nvml.DeviceData{ + UUID: "UUID3", + DeviceName: helper.StringToPtr("DeviceName1"), + MemoryMiB: helper.Uint64ToPtr(1), + PowerW: helper.UintToPtr(2), + BAR1MiB: helper.Uint64ToPtr(256), + }, + PowerUsageW: helper.UintToPtr(1), + GPUUtilization: helper.UintToPtr(1), + MemoryUtilization: helper.UintToPtr(1), + EncoderUtilization: helper.UintToPtr(1), + DecoderUtilization: helper.UintToPtr(1), + TemperatureC: helper.UintToPtr(1), + UsedMemoryMiB: helper.Uint64ToPtr(1), + ECCErrorsL1Cache: helper.Uint64ToPtr(100), + ECCErrorsL2Cache: helper.Uint64ToPtr(100), + ECCErrorsDevice: helper.Uint64ToPtr(100), + }, + }, + ProvidedIDs: map[string]struct{}{ + "UUID2": {}, + }, + ExpectedResult: []*nvml.StatsData{ + { + DeviceData: &nvml.DeviceData{ + UUID: "UUID2", + DeviceName: helper.StringToPtr("DeviceName1"), + MemoryMiB: helper.Uint64ToPtr(1), + PowerW: helper.UintToPtr(2), + BAR1MiB: helper.Uint64ToPtr(256), + }, + PowerUsageW: helper.UintToPtr(1), + GPUUtilization: helper.UintToPtr(1), + MemoryUtilization: helper.UintToPtr(1), + EncoderUtilization: helper.UintToPtr(1), + DecoderUtilization: helper.UintToPtr(1), + TemperatureC: helper.UintToPtr(1), + UsedMemoryMiB: helper.Uint64ToPtr(1), + ECCErrorsL1Cache: helper.Uint64ToPtr(100), + ECCErrorsL2Cache: helper.Uint64ToPtr(100), + ECCErrorsDevice: helper.Uint64ToPtr(100), + }, + }, + }, + { + Name: "Even are not provided in the map", + ProvidedStats: []*nvml.StatsData{ + { + DeviceData: &nvml.DeviceData{ + UUID: "UUID1", + DeviceName: helper.StringToPtr("DeviceName1"), + MemoryMiB: helper.Uint64ToPtr(1), + 
PowerW: helper.UintToPtr(2), + BAR1MiB: helper.Uint64ToPtr(256), + }, + PowerUsageW: helper.UintToPtr(1), + GPUUtilization: helper.UintToPtr(1), + MemoryUtilization: helper.UintToPtr(1), + EncoderUtilization: helper.UintToPtr(1), + DecoderUtilization: helper.UintToPtr(1), + TemperatureC: helper.UintToPtr(1), + UsedMemoryMiB: helper.Uint64ToPtr(1), + ECCErrorsL1Cache: helper.Uint64ToPtr(100), + ECCErrorsL2Cache: helper.Uint64ToPtr(100), + ECCErrorsDevice: helper.Uint64ToPtr(100), + }, + { + DeviceData: &nvml.DeviceData{ + UUID: "UUID2", + DeviceName: helper.StringToPtr("DeviceName1"), + MemoryMiB: helper.Uint64ToPtr(1), + PowerW: helper.UintToPtr(2), + BAR1MiB: helper.Uint64ToPtr(256), + }, + PowerUsageW: helper.UintToPtr(1), + GPUUtilization: helper.UintToPtr(1), + MemoryUtilization: helper.UintToPtr(1), + EncoderUtilization: helper.UintToPtr(1), + DecoderUtilization: helper.UintToPtr(1), + TemperatureC: helper.UintToPtr(1), + UsedMemoryMiB: helper.Uint64ToPtr(1), + ECCErrorsL1Cache: helper.Uint64ToPtr(100), + ECCErrorsL2Cache: helper.Uint64ToPtr(100), + ECCErrorsDevice: helper.Uint64ToPtr(100), + }, + { + DeviceData: &nvml.DeviceData{ + UUID: "UUID3", + DeviceName: helper.StringToPtr("DeviceName1"), + MemoryMiB: helper.Uint64ToPtr(1), + PowerW: helper.UintToPtr(2), + BAR1MiB: helper.Uint64ToPtr(256), + }, + PowerUsageW: helper.UintToPtr(1), + GPUUtilization: helper.UintToPtr(1), + MemoryUtilization: helper.UintToPtr(1), + EncoderUtilization: helper.UintToPtr(1), + DecoderUtilization: helper.UintToPtr(1), + TemperatureC: helper.UintToPtr(1), + UsedMemoryMiB: helper.Uint64ToPtr(1), + ECCErrorsL1Cache: helper.Uint64ToPtr(100), + ECCErrorsL2Cache: helper.Uint64ToPtr(100), + ECCErrorsDevice: helper.Uint64ToPtr(100), + }, + }, + ProvidedIDs: map[string]struct{}{ + "UUID1": {}, + "UUID3": {}, + }, + ExpectedResult: []*nvml.StatsData{ + { + DeviceData: &nvml.DeviceData{ + UUID: "UUID1", + DeviceName: helper.StringToPtr("DeviceName1"), + MemoryMiB: helper.Uint64ToPtr(1), + 
PowerW: helper.UintToPtr(2), + BAR1MiB: helper.Uint64ToPtr(256), + }, + PowerUsageW: helper.UintToPtr(1), + GPUUtilization: helper.UintToPtr(1), + MemoryUtilization: helper.UintToPtr(1), + EncoderUtilization: helper.UintToPtr(1), + DecoderUtilization: helper.UintToPtr(1), + TemperatureC: helper.UintToPtr(1), + UsedMemoryMiB: helper.Uint64ToPtr(1), + ECCErrorsL1Cache: helper.Uint64ToPtr(100), + ECCErrorsL2Cache: helper.Uint64ToPtr(100), + ECCErrorsDevice: helper.Uint64ToPtr(100), + }, + { + DeviceData: &nvml.DeviceData{ + UUID: "UUID3", + DeviceName: helper.StringToPtr("DeviceName1"), + MemoryMiB: helper.Uint64ToPtr(1), + PowerW: helper.UintToPtr(2), + BAR1MiB: helper.Uint64ToPtr(256), + }, + PowerUsageW: helper.UintToPtr(1), + GPUUtilization: helper.UintToPtr(1), + MemoryUtilization: helper.UintToPtr(1), + EncoderUtilization: helper.UintToPtr(1), + DecoderUtilization: helper.UintToPtr(1), + TemperatureC: helper.UintToPtr(1), + UsedMemoryMiB: helper.Uint64ToPtr(1), + ECCErrorsL1Cache: helper.Uint64ToPtr(100), + ECCErrorsL2Cache: helper.Uint64ToPtr(100), + ECCErrorsDevice: helper.Uint64ToPtr(100), + }, + }, + }, + { + Name: "No Stats were provided", + ProvidedIDs: map[string]struct{}{ + "UUID1": {}, + "UUID2": {}, + "UUID3": {}, + }, + }, + { + Name: "No Ids were provided", + ProvidedStats: []*nvml.StatsData{ + { + DeviceData: &nvml.DeviceData{ + UUID: "UUID1", + DeviceName: helper.StringToPtr("DeviceName1"), + MemoryMiB: helper.Uint64ToPtr(1), + PowerW: helper.UintToPtr(2), + BAR1MiB: helper.Uint64ToPtr(256), + }, + PowerUsageW: helper.UintToPtr(1), + GPUUtilization: helper.UintToPtr(1), + MemoryUtilization: helper.UintToPtr(1), + EncoderUtilization: helper.UintToPtr(1), + DecoderUtilization: helper.UintToPtr(1), + TemperatureC: helper.UintToPtr(1), + UsedMemoryMiB: helper.Uint64ToPtr(1), + ECCErrorsL1Cache: helper.Uint64ToPtr(100), + ECCErrorsL2Cache: helper.Uint64ToPtr(100), + ECCErrorsDevice: helper.Uint64ToPtr(100), + }, + { + DeviceData: &nvml.DeviceData{ + 
UUID: "UUID2", + DeviceName: helper.StringToPtr("DeviceName1"), + MemoryMiB: helper.Uint64ToPtr(1), + PowerW: helper.UintToPtr(2), + BAR1MiB: helper.Uint64ToPtr(256), + }, + PowerUsageW: helper.UintToPtr(1), + GPUUtilization: helper.UintToPtr(1), + MemoryUtilization: helper.UintToPtr(1), + EncoderUtilization: helper.UintToPtr(1), + DecoderUtilization: helper.UintToPtr(1), + TemperatureC: helper.UintToPtr(1), + UsedMemoryMiB: helper.Uint64ToPtr(1), + ECCErrorsL1Cache: helper.Uint64ToPtr(100), + ECCErrorsL2Cache: helper.Uint64ToPtr(100), + ECCErrorsDevice: helper.Uint64ToPtr(100), + }, + { + DeviceData: &nvml.DeviceData{ + UUID: "UUID3", + DeviceName: helper.StringToPtr("DeviceName1"), + MemoryMiB: helper.Uint64ToPtr(1), + PowerW: helper.UintToPtr(2), + BAR1MiB: helper.Uint64ToPtr(256), + }, + PowerUsageW: helper.UintToPtr(1), + GPUUtilization: helper.UintToPtr(1), + MemoryUtilization: helper.UintToPtr(1), + EncoderUtilization: helper.UintToPtr(1), + DecoderUtilization: helper.UintToPtr(1), + TemperatureC: helper.UintToPtr(1), + UsedMemoryMiB: helper.Uint64ToPtr(1), + ECCErrorsL1Cache: helper.Uint64ToPtr(100), + ECCErrorsL2Cache: helper.Uint64ToPtr(100), + ECCErrorsDevice: helper.Uint64ToPtr(100), + }, + }, + }, + } { + actualResult := filterStatsByID(testCase.ProvidedStats, testCase.ProvidedIDs) + require.New(t).Equal(testCase.ExpectedResult, actualResult) + } +} + +func TestStatsForItem(t *testing.T) { + for _, testCase := range []struct { + Name string + Timestamp time.Time + ItemStat *nvml.StatsData + ExpectedResult *device.DeviceStats + }{ + { + Name: "All fields in ItemStat are not nil", + Timestamp: time.Date(1974, time.May, 19, 1, 2, 3, 4, time.UTC), + ItemStat: &nvml.StatsData{ + DeviceData: &nvml.DeviceData{ + UUID: "UUID1", + DeviceName: helper.StringToPtr("DeviceName1"), + MemoryMiB: helper.Uint64ToPtr(1), + PowerW: helper.UintToPtr(1), + BAR1MiB: helper.Uint64ToPtr(256), + }, + PowerUsageW: helper.UintToPtr(1), + GPUUtilization: helper.UintToPtr(1), + 
MemoryUtilization: helper.UintToPtr(1), + EncoderUtilization: helper.UintToPtr(1), + DecoderUtilization: helper.UintToPtr(1), + TemperatureC: helper.UintToPtr(1), + UsedMemoryMiB: helper.Uint64ToPtr(1), + BAR1UsedMiB: helper.Uint64ToPtr(1), + ECCErrorsL1Cache: helper.Uint64ToPtr(100), + ECCErrorsL2Cache: helper.Uint64ToPtr(100), + ECCErrorsDevice: helper.Uint64ToPtr(100), + }, + ExpectedResult: &device.DeviceStats{ + Summary: &device.StatValue{ + Unit: TemperatureUnit, + Desc: TemperatureDesc, + IntNumeratorVal: 1, + }, + Stats: &device.StatObject{ + Attributes: map[string]*device.StatValue{ + PowerUsageAttr: { + Unit: PowerUsageUnit, + Desc: PowerUsageDesc, + IntNumeratorVal: 1, + IntDenominatorVal: 1, + }, + GPUUtilizationAttr: { + Unit: GPUUtilizationUnit, + Desc: GPUUtilizationDesc, + IntNumeratorVal: 1, + }, + MemoryUtilizationAttr: { + Unit: MemoryUtilizationUnit, + Desc: MemoryUtilizationDesc, + IntNumeratorVal: 1, + }, + EncoderUtilizationAttr: { + Unit: EncoderUtilizationUnit, + Desc: EncoderUtilizationDesc, + IntNumeratorVal: 1, + }, + DecoderUtilizationAttr: { + Unit: DecoderUtilizationUnit, + Desc: DecoderUtilizationDesc, + IntNumeratorVal: 1, + }, + TemperatureAttr: { + Unit: TemperatureUnit, + Desc: TemperatureDesc, + IntNumeratorVal: 1, + }, + MemoryStateAttr: { + Unit: MemoryStateUnit, + Desc: MemoryStateDesc, + IntNumeratorVal: 1, + IntDenominatorVal: 1, + }, + BAR1StateAttr: { + Unit: BAR1StateUnit, + Desc: BAR1StateDesc, + IntNumeratorVal: 1, + IntDenominatorVal: 256, + }, + ECCErrorsL1CacheAttr: { + Unit: ECCErrorsL1CacheUnit, + Desc: ECCErrorsL1CacheDesc, + IntNumeratorVal: 100, + }, + ECCErrorsL2CacheAttr: { + Unit: ECCErrorsL2CacheUnit, + Desc: ECCErrorsL2CacheDesc, + IntNumeratorVal: 100, + }, + ECCErrorsDeviceAttr: { + Unit: ECCErrorsDeviceUnit, + Desc: ECCErrorsDeviceDesc, + IntNumeratorVal: 100, + }, + }, + }, + Timestamp: time.Date(1974, time.May, 19, 1, 2, 3, 4, time.UTC), + }, + }, + { + Name: "Power usage is nil", + Timestamp: 
time.Date(1974, time.May, 19, 1, 2, 3, 4, time.UTC), + ItemStat: &nvml.StatsData{ + DeviceData: &nvml.DeviceData{ + UUID: "UUID1", + DeviceName: helper.StringToPtr("DeviceName1"), + MemoryMiB: helper.Uint64ToPtr(1), + PowerW: helper.UintToPtr(1), + BAR1MiB: helper.Uint64ToPtr(256), + }, + PowerUsageW: nil, + GPUUtilization: helper.UintToPtr(1), + MemoryUtilization: helper.UintToPtr(1), + EncoderUtilization: helper.UintToPtr(1), + DecoderUtilization: helper.UintToPtr(1), + TemperatureC: helper.UintToPtr(1), + UsedMemoryMiB: helper.Uint64ToPtr(1), + BAR1UsedMiB: helper.Uint64ToPtr(1), + ECCErrorsL1Cache: helper.Uint64ToPtr(100), + ECCErrorsL2Cache: helper.Uint64ToPtr(100), + ECCErrorsDevice: helper.Uint64ToPtr(100), + }, + ExpectedResult: &device.DeviceStats{ + Summary: &device.StatValue{ + Unit: TemperatureUnit, + Desc: TemperatureDesc, + IntNumeratorVal: 1, + }, + Stats: &device.StatObject{ + Attributes: map[string]*device.StatValue{ + PowerUsageAttr: { + Unit: PowerUsageUnit, + Desc: PowerUsageDesc, + StringVal: notAvailable, + }, + GPUUtilizationAttr: { + Unit: GPUUtilizationUnit, + Desc: GPUUtilizationDesc, + IntNumeratorVal: 1, + }, + MemoryUtilizationAttr: { + Unit: MemoryUtilizationUnit, + Desc: MemoryUtilizationDesc, + IntNumeratorVal: 1, + }, + EncoderUtilizationAttr: { + Unit: EncoderUtilizationUnit, + Desc: EncoderUtilizationDesc, + IntNumeratorVal: 1, + }, + DecoderUtilizationAttr: { + Unit: DecoderUtilizationUnit, + Desc: DecoderUtilizationDesc, + IntNumeratorVal: 1, + }, + TemperatureAttr: { + Unit: TemperatureUnit, + Desc: TemperatureDesc, + IntNumeratorVal: 1, + }, + MemoryStateAttr: { + Unit: MemoryStateUnit, + Desc: MemoryStateDesc, + IntNumeratorVal: 1, + IntDenominatorVal: 1, + }, + BAR1StateAttr: { + Unit: BAR1StateUnit, + Desc: BAR1StateDesc, + IntNumeratorVal: 1, + IntDenominatorVal: 256, + }, + ECCErrorsL1CacheAttr: { + Unit: ECCErrorsL1CacheUnit, + Desc: ECCErrorsL1CacheDesc, + IntNumeratorVal: 100, + }, + ECCErrorsL2CacheAttr: { + Unit: 
ECCErrorsL2CacheUnit, + Desc: ECCErrorsL2CacheDesc, + IntNumeratorVal: 100, + }, + ECCErrorsDeviceAttr: { + Unit: ECCErrorsDeviceUnit, + Desc: ECCErrorsDeviceDesc, + IntNumeratorVal: 100, + }, + }, + }, + Timestamp: time.Date(1974, time.May, 19, 1, 2, 3, 4, time.UTC), + }, + }, + { + Name: "PowerW is nil", + Timestamp: time.Date(1974, time.May, 19, 1, 2, 3, 4, time.UTC), + ItemStat: &nvml.StatsData{ + DeviceData: &nvml.DeviceData{ + UUID: "UUID1", + DeviceName: helper.StringToPtr("DeviceName1"), + MemoryMiB: helper.Uint64ToPtr(1), + PowerW: nil, + BAR1MiB: helper.Uint64ToPtr(256), + }, + PowerUsageW: helper.UintToPtr(1), + GPUUtilization: helper.UintToPtr(1), + MemoryUtilization: helper.UintToPtr(1), + EncoderUtilization: helper.UintToPtr(1), + DecoderUtilization: helper.UintToPtr(1), + TemperatureC: helper.UintToPtr(1), + UsedMemoryMiB: helper.Uint64ToPtr(1), + BAR1UsedMiB: helper.Uint64ToPtr(1), + ECCErrorsL1Cache: helper.Uint64ToPtr(100), + ECCErrorsL2Cache: helper.Uint64ToPtr(100), + ECCErrorsDevice: helper.Uint64ToPtr(100), + }, + ExpectedResult: &device.DeviceStats{ + Summary: &device.StatValue{ + Unit: TemperatureUnit, + Desc: TemperatureDesc, + IntNumeratorVal: 1, + }, + Stats: &device.StatObject{ + Attributes: map[string]*device.StatValue{ + PowerUsageAttr: { + Unit: PowerUsageUnit, + Desc: PowerUsageDesc, + StringVal: notAvailable, + }, + GPUUtilizationAttr: { + Unit: GPUUtilizationUnit, + Desc: GPUUtilizationDesc, + IntNumeratorVal: 1, + }, + MemoryUtilizationAttr: { + Unit: MemoryUtilizationUnit, + Desc: MemoryUtilizationDesc, + IntNumeratorVal: 1, + }, + EncoderUtilizationAttr: { + Unit: EncoderUtilizationUnit, + Desc: EncoderUtilizationDesc, + IntNumeratorVal: 1, + }, + DecoderUtilizationAttr: { + Unit: DecoderUtilizationUnit, + Desc: DecoderUtilizationDesc, + IntNumeratorVal: 1, + }, + TemperatureAttr: { + Unit: TemperatureUnit, + Desc: TemperatureDesc, + IntNumeratorVal: 1, + }, + MemoryStateAttr: { + Unit: MemoryStateUnit, + Desc: MemoryStateDesc, 
+ IntNumeratorVal: 1, + IntDenominatorVal: 1, + }, + BAR1StateAttr: { + Unit: BAR1StateUnit, + Desc: BAR1StateDesc, + IntNumeratorVal: 1, + IntDenominatorVal: 256, + }, + ECCErrorsL1CacheAttr: { + Unit: ECCErrorsL1CacheUnit, + Desc: ECCErrorsL1CacheDesc, + IntNumeratorVal: 100, + }, + ECCErrorsL2CacheAttr: { + Unit: ECCErrorsL2CacheUnit, + Desc: ECCErrorsL2CacheDesc, + IntNumeratorVal: 100, + }, + ECCErrorsDeviceAttr: { + Unit: ECCErrorsDeviceUnit, + Desc: ECCErrorsDeviceDesc, + IntNumeratorVal: 100, + }, + }, + }, + Timestamp: time.Date(1974, time.May, 19, 1, 2, 3, 4, time.UTC), + }, + }, + { + Name: "GPUUtilization is nil", + Timestamp: time.Date(1974, time.May, 19, 1, 2, 3, 4, time.UTC), + ItemStat: &nvml.StatsData{ + DeviceData: &nvml.DeviceData{ + UUID: "UUID1", + DeviceName: helper.StringToPtr("DeviceName1"), + MemoryMiB: helper.Uint64ToPtr(1), + PowerW: helper.UintToPtr(1), + BAR1MiB: helper.Uint64ToPtr(256), + }, + PowerUsageW: helper.UintToPtr(1), + GPUUtilization: nil, + MemoryUtilization: helper.UintToPtr(1), + EncoderUtilization: helper.UintToPtr(1), + DecoderUtilization: helper.UintToPtr(1), + TemperatureC: helper.UintToPtr(1), + UsedMemoryMiB: helper.Uint64ToPtr(1), + BAR1UsedMiB: helper.Uint64ToPtr(1), + ECCErrorsL1Cache: helper.Uint64ToPtr(100), + ECCErrorsL2Cache: helper.Uint64ToPtr(100), + ECCErrorsDevice: helper.Uint64ToPtr(100), + }, + ExpectedResult: &device.DeviceStats{ + Summary: &device.StatValue{ + Unit: TemperatureUnit, + Desc: TemperatureDesc, + IntNumeratorVal: 1, + }, + Stats: &device.StatObject{ + Attributes: map[string]*device.StatValue{ + PowerUsageAttr: { + Unit: PowerUsageUnit, + Desc: PowerUsageDesc, + IntNumeratorVal: 1, + IntDenominatorVal: 1, + }, + GPUUtilizationAttr: { + Unit: GPUUtilizationUnit, + Desc: GPUUtilizationDesc, + StringVal: notAvailable, + }, + MemoryUtilizationAttr: { + Unit: MemoryUtilizationUnit, + Desc: MemoryUtilizationDesc, + IntNumeratorVal: 1, + }, + EncoderUtilizationAttr: { + Unit: 
EncoderUtilizationUnit, + Desc: EncoderUtilizationDesc, + IntNumeratorVal: 1, + }, + DecoderUtilizationAttr: { + Unit: DecoderUtilizationUnit, + Desc: DecoderUtilizationDesc, + IntNumeratorVal: 1, + }, + TemperatureAttr: { + Unit: TemperatureUnit, + Desc: TemperatureDesc, + IntNumeratorVal: 1, + }, + MemoryStateAttr: { + Unit: MemoryStateUnit, + Desc: MemoryStateDesc, + IntNumeratorVal: 1, + IntDenominatorVal: 1, + }, + BAR1StateAttr: { + Unit: BAR1StateUnit, + Desc: BAR1StateDesc, + IntNumeratorVal: 1, + IntDenominatorVal: 256, + }, + ECCErrorsL1CacheAttr: { + Unit: ECCErrorsL1CacheUnit, + Desc: ECCErrorsL1CacheDesc, + IntNumeratorVal: 100, + }, + ECCErrorsL2CacheAttr: { + Unit: ECCErrorsL2CacheUnit, + Desc: ECCErrorsL2CacheDesc, + IntNumeratorVal: 100, + }, + ECCErrorsDeviceAttr: { + Unit: ECCErrorsDeviceUnit, + Desc: ECCErrorsDeviceDesc, + IntNumeratorVal: 100, + }, + }, + }, + Timestamp: time.Date(1974, time.May, 19, 1, 2, 3, 4, time.UTC), + }, + }, + { + Name: "MemoryUtilization is nil", + Timestamp: time.Date(1974, time.May, 19, 1, 2, 3, 4, time.UTC), + ItemStat: &nvml.StatsData{ + DeviceData: &nvml.DeviceData{ + UUID: "UUID1", + DeviceName: helper.StringToPtr("DeviceName1"), + MemoryMiB: helper.Uint64ToPtr(1), + PowerW: helper.UintToPtr(1), + BAR1MiB: helper.Uint64ToPtr(256), + }, + PowerUsageW: helper.UintToPtr(1), + GPUUtilization: helper.UintToPtr(1), + MemoryUtilization: nil, + EncoderUtilization: helper.UintToPtr(1), + DecoderUtilization: helper.UintToPtr(1), + TemperatureC: helper.UintToPtr(1), + UsedMemoryMiB: helper.Uint64ToPtr(1), + BAR1UsedMiB: helper.Uint64ToPtr(1), + ECCErrorsL1Cache: helper.Uint64ToPtr(100), + ECCErrorsL2Cache: helper.Uint64ToPtr(100), + ECCErrorsDevice: helper.Uint64ToPtr(100), + }, + ExpectedResult: &device.DeviceStats{ + Summary: &device.StatValue{ + Unit: TemperatureUnit, + Desc: TemperatureDesc, + IntNumeratorVal: 1, + }, + Stats: &device.StatObject{ + Attributes: map[string]*device.StatValue{ + PowerUsageAttr: { + Unit: 
PowerUsageUnit, + Desc: PowerUsageDesc, + IntNumeratorVal: 1, + IntDenominatorVal: 1, + }, + GPUUtilizationAttr: { + Unit: GPUUtilizationUnit, + Desc: GPUUtilizationDesc, + IntNumeratorVal: 1, + }, + MemoryUtilizationAttr: { + Unit: MemoryUtilizationUnit, + Desc: MemoryUtilizationDesc, + StringVal: notAvailable, + }, + EncoderUtilizationAttr: { + Unit: EncoderUtilizationUnit, + Desc: EncoderUtilizationDesc, + IntNumeratorVal: 1, + }, + DecoderUtilizationAttr: { + Unit: DecoderUtilizationUnit, + Desc: DecoderUtilizationDesc, + IntNumeratorVal: 1, + }, + TemperatureAttr: { + Unit: TemperatureUnit, + Desc: TemperatureDesc, + IntNumeratorVal: 1, + }, + MemoryStateAttr: { + Unit: MemoryStateUnit, + Desc: MemoryStateDesc, + IntNumeratorVal: 1, + IntDenominatorVal: 1, + }, + BAR1StateAttr: { + Unit: BAR1StateUnit, + Desc: BAR1StateDesc, + IntNumeratorVal: 1, + IntDenominatorVal: 256, + }, + ECCErrorsL1CacheAttr: { + Unit: ECCErrorsL1CacheUnit, + Desc: ECCErrorsL1CacheDesc, + IntNumeratorVal: 100, + }, + ECCErrorsL2CacheAttr: { + Unit: ECCErrorsL2CacheUnit, + Desc: ECCErrorsL2CacheDesc, + IntNumeratorVal: 100, + }, + ECCErrorsDeviceAttr: { + Unit: ECCErrorsDeviceUnit, + Desc: ECCErrorsDeviceDesc, + IntNumeratorVal: 100, + }, + }, + }, + Timestamp: time.Date(1974, time.May, 19, 1, 2, 3, 4, time.UTC), + }, + }, + { + Name: "EncoderUtilization is nil", + Timestamp: time.Date(1974, time.May, 19, 1, 2, 3, 4, time.UTC), + ItemStat: &nvml.StatsData{ + DeviceData: &nvml.DeviceData{ + UUID: "UUID1", + DeviceName: helper.StringToPtr("DeviceName1"), + MemoryMiB: helper.Uint64ToPtr(1), + PowerW: helper.UintToPtr(1), + BAR1MiB: helper.Uint64ToPtr(256), + }, + PowerUsageW: helper.UintToPtr(1), + GPUUtilization: helper.UintToPtr(1), + MemoryUtilization: helper.UintToPtr(1), + EncoderUtilization: nil, + DecoderUtilization: helper.UintToPtr(1), + TemperatureC: helper.UintToPtr(1), + UsedMemoryMiB: helper.Uint64ToPtr(1), + BAR1UsedMiB: helper.Uint64ToPtr(1), + ECCErrorsL1Cache: 
helper.Uint64ToPtr(100), + ECCErrorsL2Cache: helper.Uint64ToPtr(100), + ECCErrorsDevice: helper.Uint64ToPtr(100), + }, + ExpectedResult: &device.DeviceStats{ + Summary: &device.StatValue{ + Unit: TemperatureUnit, + Desc: TemperatureDesc, + IntNumeratorVal: 1, + }, + Stats: &device.StatObject{ + Attributes: map[string]*device.StatValue{ + PowerUsageAttr: { + Unit: PowerUsageUnit, + Desc: PowerUsageDesc, + IntNumeratorVal: 1, + IntDenominatorVal: 1, + }, + GPUUtilizationAttr: { + Unit: GPUUtilizationUnit, + Desc: GPUUtilizationDesc, + IntNumeratorVal: 1, + }, + MemoryUtilizationAttr: { + Unit: MemoryUtilizationUnit, + Desc: MemoryUtilizationDesc, + IntNumeratorVal: 1, + }, + EncoderUtilizationAttr: { + Unit: EncoderUtilizationUnit, + Desc: EncoderUtilizationDesc, + StringVal: notAvailable, + }, + DecoderUtilizationAttr: { + Unit: DecoderUtilizationUnit, + Desc: DecoderUtilizationDesc, + IntNumeratorVal: 1, + }, + TemperatureAttr: { + Unit: TemperatureUnit, + Desc: TemperatureDesc, + IntNumeratorVal: 1, + }, + MemoryStateAttr: { + Unit: MemoryStateUnit, + Desc: MemoryStateDesc, + IntNumeratorVal: 1, + IntDenominatorVal: 1, + }, + BAR1StateAttr: { + Unit: BAR1StateUnit, + Desc: BAR1StateDesc, + IntNumeratorVal: 1, + IntDenominatorVal: 256, + }, + ECCErrorsL1CacheAttr: { + Unit: ECCErrorsL1CacheUnit, + Desc: ECCErrorsL1CacheDesc, + IntNumeratorVal: 100, + }, + ECCErrorsL2CacheAttr: { + Unit: ECCErrorsL2CacheUnit, + Desc: ECCErrorsL2CacheDesc, + IntNumeratorVal: 100, + }, + ECCErrorsDeviceAttr: { + Unit: ECCErrorsDeviceUnit, + Desc: ECCErrorsDeviceDesc, + IntNumeratorVal: 100, + }, + }, + }, + Timestamp: time.Date(1974, time.May, 19, 1, 2, 3, 4, time.UTC), + }, + }, + { + Name: "DecoderUtilization is nil", + Timestamp: time.Date(1974, time.May, 19, 1, 2, 3, 4, time.UTC), + ItemStat: &nvml.StatsData{ + DeviceData: &nvml.DeviceData{ + UUID: "UUID1", + DeviceName: helper.StringToPtr("DeviceName1"), + MemoryMiB: helper.Uint64ToPtr(1), + PowerW: helper.UintToPtr(1), + 
BAR1MiB: helper.Uint64ToPtr(256), + }, + PowerUsageW: helper.UintToPtr(1), + GPUUtilization: helper.UintToPtr(1), + MemoryUtilization: helper.UintToPtr(1), + EncoderUtilization: helper.UintToPtr(1), + DecoderUtilization: nil, + TemperatureC: helper.UintToPtr(1), + UsedMemoryMiB: helper.Uint64ToPtr(1), + BAR1UsedMiB: helper.Uint64ToPtr(1), + ECCErrorsL1Cache: helper.Uint64ToPtr(100), + ECCErrorsL2Cache: helper.Uint64ToPtr(100), + ECCErrorsDevice: helper.Uint64ToPtr(100), + }, + ExpectedResult: &device.DeviceStats{ + Summary: &device.StatValue{ + Unit: TemperatureUnit, + Desc: TemperatureDesc, + IntNumeratorVal: 1, + }, + Stats: &device.StatObject{ + Attributes: map[string]*device.StatValue{ + PowerUsageAttr: { + Unit: PowerUsageUnit, + Desc: PowerUsageDesc, + IntNumeratorVal: 1, + IntDenominatorVal: 1, + }, + GPUUtilizationAttr: { + Unit: GPUUtilizationUnit, + Desc: GPUUtilizationDesc, + IntNumeratorVal: 1, + }, + MemoryUtilizationAttr: { + Unit: MemoryUtilizationUnit, + Desc: MemoryUtilizationDesc, + IntNumeratorVal: 1, + }, + EncoderUtilizationAttr: { + Unit: EncoderUtilizationUnit, + Desc: EncoderUtilizationDesc, + IntNumeratorVal: 1, + }, + DecoderUtilizationAttr: { + Unit: DecoderUtilizationUnit, + Desc: DecoderUtilizationDesc, + StringVal: notAvailable, + }, + TemperatureAttr: { + Unit: TemperatureUnit, + Desc: TemperatureDesc, + IntNumeratorVal: 1, + }, + MemoryStateAttr: { + Unit: MemoryStateUnit, + Desc: MemoryStateDesc, + IntNumeratorVal: 1, + IntDenominatorVal: 1, + }, + BAR1StateAttr: { + Unit: BAR1StateUnit, + Desc: BAR1StateDesc, + IntNumeratorVal: 1, + IntDenominatorVal: 256, + }, + ECCErrorsL1CacheAttr: { + Unit: ECCErrorsL1CacheUnit, + Desc: ECCErrorsL1CacheDesc, + IntNumeratorVal: 100, + }, + ECCErrorsL2CacheAttr: { + Unit: ECCErrorsL2CacheUnit, + Desc: ECCErrorsL2CacheDesc, + IntNumeratorVal: 100, + }, + ECCErrorsDeviceAttr: { + Unit: ECCErrorsDeviceUnit, + Desc: ECCErrorsDeviceDesc, + IntNumeratorVal: 100, + }, + }, + }, + Timestamp: 
time.Date(1974, time.May, 19, 1, 2, 3, 4, time.UTC), + }, + }, + { + Name: "Temperature is nil", + Timestamp: time.Date(1974, time.May, 19, 1, 2, 3, 4, time.UTC), + ItemStat: &nvml.StatsData{ + DeviceData: &nvml.DeviceData{ + UUID: "UUID1", + DeviceName: helper.StringToPtr("DeviceName1"), + MemoryMiB: helper.Uint64ToPtr(1), + PowerW: helper.UintToPtr(1), + BAR1MiB: helper.Uint64ToPtr(256), + }, + PowerUsageW: helper.UintToPtr(1), + GPUUtilization: helper.UintToPtr(1), + MemoryUtilization: helper.UintToPtr(1), + EncoderUtilization: helper.UintToPtr(1), + DecoderUtilization: helper.UintToPtr(1), + TemperatureC: nil, + UsedMemoryMiB: helper.Uint64ToPtr(1), + BAR1UsedMiB: helper.Uint64ToPtr(1), + ECCErrorsL1Cache: helper.Uint64ToPtr(100), + ECCErrorsL2Cache: helper.Uint64ToPtr(100), + ECCErrorsDevice: helper.Uint64ToPtr(100), + }, + ExpectedResult: &device.DeviceStats{ + Summary: &device.StatValue{ + Unit: TemperatureUnit, + Desc: TemperatureDesc, + StringVal: notAvailable, + }, + Stats: &device.StatObject{ + Attributes: map[string]*device.StatValue{ + PowerUsageAttr: { + Unit: PowerUsageUnit, + Desc: PowerUsageDesc, + IntNumeratorVal: 1, + IntDenominatorVal: 1, + }, + GPUUtilizationAttr: { + Unit: GPUUtilizationUnit, + Desc: GPUUtilizationDesc, + IntNumeratorVal: 1, + }, + MemoryUtilizationAttr: { + Unit: MemoryUtilizationUnit, + Desc: MemoryUtilizationDesc, + IntNumeratorVal: 1, + }, + EncoderUtilizationAttr: { + Unit: EncoderUtilizationUnit, + Desc: EncoderUtilizationDesc, + IntNumeratorVal: 1, + }, + DecoderUtilizationAttr: { + Unit: DecoderUtilizationUnit, + Desc: DecoderUtilizationDesc, + IntNumeratorVal: 1, + }, + TemperatureAttr: { + Unit: TemperatureUnit, + Desc: TemperatureDesc, + StringVal: notAvailable, + }, + MemoryStateAttr: { + Unit: MemoryStateUnit, + Desc: MemoryStateDesc, + IntNumeratorVal: 1, + IntDenominatorVal: 1, + }, + BAR1StateAttr: { + Unit: BAR1StateUnit, + Desc: BAR1StateDesc, + IntNumeratorVal: 1, + IntDenominatorVal: 256, + }, + 
ECCErrorsL1CacheAttr: { + Unit: ECCErrorsL1CacheUnit, + Desc: ECCErrorsL1CacheDesc, + IntNumeratorVal: 100, + }, + ECCErrorsL2CacheAttr: { + Unit: ECCErrorsL2CacheUnit, + Desc: ECCErrorsL2CacheDesc, + IntNumeratorVal: 100, + }, + ECCErrorsDeviceAttr: { + Unit: ECCErrorsDeviceUnit, + Desc: ECCErrorsDeviceDesc, + IntNumeratorVal: 100, + }, + }, + }, + Timestamp: time.Date(1974, time.May, 19, 1, 2, 3, 4, time.UTC), + }, + }, + { + Name: "UsedMemoryMiB is nil", + Timestamp: time.Date(1974, time.May, 19, 1, 2, 3, 4, time.UTC), + ItemStat: &nvml.StatsData{ + DeviceData: &nvml.DeviceData{ + UUID: "UUID1", + DeviceName: helper.StringToPtr("DeviceName1"), + MemoryMiB: helper.Uint64ToPtr(1), + PowerW: helper.UintToPtr(1), + BAR1MiB: helper.Uint64ToPtr(256), + }, + PowerUsageW: helper.UintToPtr(1), + GPUUtilization: helper.UintToPtr(1), + MemoryUtilization: helper.UintToPtr(1), + EncoderUtilization: helper.UintToPtr(1), + DecoderUtilization: helper.UintToPtr(1), + TemperatureC: helper.UintToPtr(1), + UsedMemoryMiB: nil, + BAR1UsedMiB: helper.Uint64ToPtr(1), + ECCErrorsL1Cache: helper.Uint64ToPtr(100), + ECCErrorsL2Cache: helper.Uint64ToPtr(100), + ECCErrorsDevice: helper.Uint64ToPtr(100), + }, + ExpectedResult: &device.DeviceStats{ + Summary: &device.StatValue{ + Unit: TemperatureUnit, + Desc: TemperatureDesc, + IntNumeratorVal: 1, + }, + Stats: &device.StatObject{ + Attributes: map[string]*device.StatValue{ + PowerUsageAttr: { + Unit: PowerUsageUnit, + Desc: PowerUsageDesc, + IntNumeratorVal: 1, + IntDenominatorVal: 1, + }, + GPUUtilizationAttr: { + Unit: GPUUtilizationUnit, + Desc: GPUUtilizationDesc, + IntNumeratorVal: 1, + }, + MemoryUtilizationAttr: { + Unit: MemoryUtilizationUnit, + Desc: MemoryUtilizationDesc, + IntNumeratorVal: 1, + }, + EncoderUtilizationAttr: { + Unit: EncoderUtilizationUnit, + Desc: EncoderUtilizationDesc, + IntNumeratorVal: 1, + }, + DecoderUtilizationAttr: { + Unit: DecoderUtilizationUnit, + Desc: DecoderUtilizationDesc, + IntNumeratorVal: 1, + 
}, + TemperatureAttr: { + Unit: TemperatureUnit, + Desc: TemperatureDesc, + IntNumeratorVal: 1, + }, + MemoryStateAttr: { + Unit: MemoryStateUnit, + Desc: MemoryStateDesc, + StringVal: notAvailable, + }, + BAR1StateAttr: { + Unit: BAR1StateUnit, + Desc: BAR1StateDesc, + IntNumeratorVal: 1, + IntDenominatorVal: 256, + }, + ECCErrorsL1CacheAttr: { + Unit: ECCErrorsL1CacheUnit, + Desc: ECCErrorsL1CacheDesc, + IntNumeratorVal: 100, + }, + ECCErrorsL2CacheAttr: { + Unit: ECCErrorsL2CacheUnit, + Desc: ECCErrorsL2CacheDesc, + IntNumeratorVal: 100, + }, + ECCErrorsDeviceAttr: { + Unit: ECCErrorsDeviceUnit, + Desc: ECCErrorsDeviceDesc, + IntNumeratorVal: 100, + }, + }, + }, + Timestamp: time.Date(1974, time.May, 19, 1, 2, 3, 4, time.UTC), + }, + }, + { + Name: "MemoryMiB is nil", + Timestamp: time.Date(1974, time.May, 19, 1, 2, 3, 4, time.UTC), + ItemStat: &nvml.StatsData{ + DeviceData: &nvml.DeviceData{ + UUID: "UUID1", + DeviceName: helper.StringToPtr("DeviceName1"), + MemoryMiB: nil, + PowerW: helper.UintToPtr(1), + BAR1MiB: helper.Uint64ToPtr(256), + }, + PowerUsageW: helper.UintToPtr(1), + GPUUtilization: helper.UintToPtr(1), + MemoryUtilization: helper.UintToPtr(1), + EncoderUtilization: helper.UintToPtr(1), + DecoderUtilization: helper.UintToPtr(1), + TemperatureC: helper.UintToPtr(1), + UsedMemoryMiB: helper.Uint64ToPtr(1), + BAR1UsedMiB: helper.Uint64ToPtr(1), + ECCErrorsL1Cache: helper.Uint64ToPtr(100), + ECCErrorsL2Cache: helper.Uint64ToPtr(100), + ECCErrorsDevice: helper.Uint64ToPtr(100), + }, + ExpectedResult: &device.DeviceStats{ + Summary: &device.StatValue{ + Unit: TemperatureUnit, + Desc: TemperatureDesc, + IntNumeratorVal: 1, + }, + Stats: &device.StatObject{ + Attributes: map[string]*device.StatValue{ + PowerUsageAttr: { + Unit: PowerUsageUnit, + Desc: PowerUsageDesc, + IntNumeratorVal: 1, + IntDenominatorVal: 1, + }, + GPUUtilizationAttr: { + Unit: GPUUtilizationUnit, + Desc: GPUUtilizationDesc, + IntNumeratorVal: 1, + }, + MemoryUtilizationAttr: { + 
Unit: MemoryUtilizationUnit, + Desc: MemoryUtilizationDesc, + IntNumeratorVal: 1, + }, + EncoderUtilizationAttr: { + Unit: EncoderUtilizationUnit, + Desc: EncoderUtilizationDesc, + IntNumeratorVal: 1, + }, + DecoderUtilizationAttr: { + Unit: DecoderUtilizationUnit, + Desc: DecoderUtilizationDesc, + IntNumeratorVal: 1, + }, + TemperatureAttr: { + Unit: TemperatureUnit, + Desc: TemperatureDesc, + IntNumeratorVal: 1, + }, + MemoryStateAttr: { + Unit: MemoryStateUnit, + Desc: MemoryStateDesc, + StringVal: notAvailable, + }, + BAR1StateAttr: { + Unit: BAR1StateUnit, + Desc: BAR1StateDesc, + IntNumeratorVal: 1, + IntDenominatorVal: 256, + }, + ECCErrorsL1CacheAttr: { + Unit: ECCErrorsL1CacheUnit, + Desc: ECCErrorsL1CacheDesc, + IntNumeratorVal: 100, + }, + ECCErrorsL2CacheAttr: { + Unit: ECCErrorsL2CacheUnit, + Desc: ECCErrorsL2CacheDesc, + IntNumeratorVal: 100, + }, + ECCErrorsDeviceAttr: { + Unit: ECCErrorsDeviceUnit, + Desc: ECCErrorsDeviceDesc, + IntNumeratorVal: 100, + }, + }, + }, + Timestamp: time.Date(1974, time.May, 19, 1, 2, 3, 4, time.UTC), + }, + }, + { + Name: "BAR1UsedMiB is nil", + Timestamp: time.Date(1974, time.May, 19, 1, 2, 3, 4, time.UTC), + ItemStat: &nvml.StatsData{ + DeviceData: &nvml.DeviceData{ + UUID: "UUID1", + DeviceName: helper.StringToPtr("DeviceName1"), + MemoryMiB: helper.Uint64ToPtr(1), + PowerW: helper.UintToPtr(1), + BAR1MiB: helper.Uint64ToPtr(256), + }, + PowerUsageW: helper.UintToPtr(1), + GPUUtilization: helper.UintToPtr(1), + MemoryUtilization: helper.UintToPtr(1), + EncoderUtilization: helper.UintToPtr(1), + DecoderUtilization: helper.UintToPtr(1), + TemperatureC: helper.UintToPtr(1), + UsedMemoryMiB: helper.Uint64ToPtr(1), + BAR1UsedMiB: nil, + ECCErrorsL1Cache: helper.Uint64ToPtr(100), + ECCErrorsL2Cache: helper.Uint64ToPtr(100), + ECCErrorsDevice: helper.Uint64ToPtr(100), + }, + ExpectedResult: &device.DeviceStats{ + Summary: &device.StatValue{ + Unit: TemperatureUnit, + Desc: TemperatureDesc, + IntNumeratorVal: 1, + }, + 
Stats: &device.StatObject{ + Attributes: map[string]*device.StatValue{ + PowerUsageAttr: { + Unit: PowerUsageUnit, + Desc: PowerUsageDesc, + IntNumeratorVal: 1, + IntDenominatorVal: 1, + }, + GPUUtilizationAttr: { + Unit: GPUUtilizationUnit, + Desc: GPUUtilizationDesc, + IntNumeratorVal: 1, + }, + MemoryUtilizationAttr: { + Unit: MemoryUtilizationUnit, + Desc: MemoryUtilizationDesc, + IntNumeratorVal: 1, + }, + EncoderUtilizationAttr: { + Unit: EncoderUtilizationUnit, + Desc: EncoderUtilizationDesc, + IntNumeratorVal: 1, + }, + DecoderUtilizationAttr: { + Unit: DecoderUtilizationUnit, + Desc: DecoderUtilizationDesc, + IntNumeratorVal: 1, + }, + TemperatureAttr: { + Unit: TemperatureUnit, + Desc: TemperatureDesc, + IntNumeratorVal: 1, + }, + MemoryStateAttr: { + Unit: MemoryStateUnit, + Desc: MemoryStateDesc, + IntNumeratorVal: 1, + IntDenominatorVal: 1, + }, + BAR1StateAttr: { + Unit: BAR1StateUnit, + Desc: BAR1StateDesc, + StringVal: notAvailable, + }, + ECCErrorsL1CacheAttr: { + Unit: ECCErrorsL1CacheUnit, + Desc: ECCErrorsL1CacheDesc, + IntNumeratorVal: 100, + }, + ECCErrorsL2CacheAttr: { + Unit: ECCErrorsL2CacheUnit, + Desc: ECCErrorsL2CacheDesc, + IntNumeratorVal: 100, + }, + ECCErrorsDeviceAttr: { + Unit: ECCErrorsDeviceUnit, + Desc: ECCErrorsDeviceDesc, + IntNumeratorVal: 100, + }, + }, + }, + Timestamp: time.Date(1974, time.May, 19, 1, 2, 3, 4, time.UTC), + }, + }, + { + Name: "BAR1MiB is nil", + Timestamp: time.Date(1974, time.May, 19, 1, 2, 3, 4, time.UTC), + ItemStat: &nvml.StatsData{ + DeviceData: &nvml.DeviceData{ + UUID: "UUID1", + DeviceName: helper.StringToPtr("DeviceName1"), + MemoryMiB: helper.Uint64ToPtr(1), + PowerW: helper.UintToPtr(1), + BAR1MiB: nil, + }, + PowerUsageW: helper.UintToPtr(1), + GPUUtilization: helper.UintToPtr(1), + MemoryUtilization: helper.UintToPtr(1), + EncoderUtilization: helper.UintToPtr(1), + DecoderUtilization: helper.UintToPtr(1), + TemperatureC: helper.UintToPtr(1), + UsedMemoryMiB: helper.Uint64ToPtr(1), + 
BAR1UsedMiB: helper.Uint64ToPtr(1), + ECCErrorsL1Cache: helper.Uint64ToPtr(100), + ECCErrorsL2Cache: helper.Uint64ToPtr(100), + ECCErrorsDevice: helper.Uint64ToPtr(100), + }, + ExpectedResult: &device.DeviceStats{ + Summary: &device.StatValue{ + Unit: TemperatureUnit, + Desc: TemperatureDesc, + IntNumeratorVal: 1, + }, + Stats: &device.StatObject{ + Attributes: map[string]*device.StatValue{ + PowerUsageAttr: { + Unit: PowerUsageUnit, + Desc: PowerUsageDesc, + IntNumeratorVal: 1, + IntDenominatorVal: 1, + }, + GPUUtilizationAttr: { + Unit: GPUUtilizationUnit, + Desc: GPUUtilizationDesc, + IntNumeratorVal: 1, + }, + MemoryUtilizationAttr: { + Unit: MemoryUtilizationUnit, + Desc: MemoryUtilizationDesc, + IntNumeratorVal: 1, + }, + EncoderUtilizationAttr: { + Unit: EncoderUtilizationUnit, + Desc: EncoderUtilizationDesc, + IntNumeratorVal: 1, + }, + DecoderUtilizationAttr: { + Unit: DecoderUtilizationUnit, + Desc: DecoderUtilizationDesc, + IntNumeratorVal: 1, + }, + TemperatureAttr: { + Unit: TemperatureUnit, + Desc: TemperatureDesc, + IntNumeratorVal: 1, + }, + MemoryStateAttr: { + Unit: MemoryStateUnit, + Desc: MemoryStateDesc, + IntNumeratorVal: 1, + IntDenominatorVal: 1, + }, + BAR1StateAttr: { + Unit: BAR1StateUnit, + Desc: BAR1StateDesc, + StringVal: notAvailable, + }, + ECCErrorsL1CacheAttr: { + Unit: ECCErrorsL1CacheUnit, + Desc: ECCErrorsL1CacheDesc, + IntNumeratorVal: 100, + }, + ECCErrorsL2CacheAttr: { + Unit: ECCErrorsL2CacheUnit, + Desc: ECCErrorsL2CacheDesc, + IntNumeratorVal: 100, + }, + ECCErrorsDeviceAttr: { + Unit: ECCErrorsDeviceUnit, + Desc: ECCErrorsDeviceDesc, + IntNumeratorVal: 100, + }, + }, + }, + Timestamp: time.Date(1974, time.May, 19, 1, 2, 3, 4, time.UTC), + }, + }, + { + Name: "ECCErrorsL1Cache is nil", + Timestamp: time.Date(1974, time.May, 19, 1, 2, 3, 4, time.UTC), + ItemStat: &nvml.StatsData{ + DeviceData: &nvml.DeviceData{ + UUID: "UUID1", + DeviceName: helper.StringToPtr("DeviceName1"), + MemoryMiB: helper.Uint64ToPtr(1), + PowerW: 
helper.UintToPtr(1), + BAR1MiB: helper.Uint64ToPtr(256), + }, + PowerUsageW: helper.UintToPtr(1), + GPUUtilization: helper.UintToPtr(1), + MemoryUtilization: helper.UintToPtr(1), + EncoderUtilization: helper.UintToPtr(1), + DecoderUtilization: helper.UintToPtr(1), + TemperatureC: helper.UintToPtr(1), + UsedMemoryMiB: helper.Uint64ToPtr(1), + BAR1UsedMiB: helper.Uint64ToPtr(1), + ECCErrorsL1Cache: nil, + ECCErrorsL2Cache: helper.Uint64ToPtr(100), + ECCErrorsDevice: helper.Uint64ToPtr(100), + }, + ExpectedResult: &device.DeviceStats{ + Summary: &device.StatValue{ + Unit: TemperatureUnit, + Desc: TemperatureDesc, + IntNumeratorVal: 1, + }, + Stats: &device.StatObject{ + Attributes: map[string]*device.StatValue{ + PowerUsageAttr: { + Unit: PowerUsageUnit, + Desc: PowerUsageDesc, + IntNumeratorVal: 1, + IntDenominatorVal: 1, + }, + GPUUtilizationAttr: { + Unit: GPUUtilizationUnit, + Desc: GPUUtilizationDesc, + IntNumeratorVal: 1, + }, + MemoryUtilizationAttr: { + Unit: MemoryUtilizationUnit, + Desc: MemoryUtilizationDesc, + IntNumeratorVal: 1, + }, + EncoderUtilizationAttr: { + Unit: EncoderUtilizationUnit, + Desc: EncoderUtilizationDesc, + IntNumeratorVal: 1, + }, + DecoderUtilizationAttr: { + Unit: DecoderUtilizationUnit, + Desc: DecoderUtilizationDesc, + IntNumeratorVal: 1, + }, + TemperatureAttr: { + Unit: TemperatureUnit, + Desc: TemperatureDesc, + IntNumeratorVal: 1, + }, + MemoryStateAttr: { + Unit: MemoryStateUnit, + Desc: MemoryStateDesc, + IntNumeratorVal: 1, + IntDenominatorVal: 1, + }, + BAR1StateAttr: { + Unit: BAR1StateUnit, + Desc: BAR1StateDesc, + IntNumeratorVal: 1, + IntDenominatorVal: 256, + }, + ECCErrorsL1CacheAttr: { + Unit: ECCErrorsL1CacheUnit, + Desc: ECCErrorsL1CacheDesc, + StringVal: notAvailable, + }, + ECCErrorsL2CacheAttr: { + Unit: ECCErrorsL2CacheUnit, + Desc: ECCErrorsL2CacheDesc, + IntNumeratorVal: 100, + }, + ECCErrorsDeviceAttr: { + Unit: ECCErrorsDeviceUnit, + Desc: ECCErrorsDeviceDesc, + IntNumeratorVal: 100, + }, + }, + }, + 
Timestamp: time.Date(1974, time.May, 19, 1, 2, 3, 4, time.UTC), + }, + }, + { + Name: "ECCErrorsL2Cache is nil", + Timestamp: time.Date(1974, time.May, 19, 1, 2, 3, 4, time.UTC), + ItemStat: &nvml.StatsData{ + DeviceData: &nvml.DeviceData{ + UUID: "UUID1", + DeviceName: helper.StringToPtr("DeviceName1"), + MemoryMiB: helper.Uint64ToPtr(1), + PowerW: helper.UintToPtr(1), + BAR1MiB: helper.Uint64ToPtr(256), + }, + PowerUsageW: helper.UintToPtr(1), + GPUUtilization: helper.UintToPtr(1), + MemoryUtilization: helper.UintToPtr(1), + EncoderUtilization: helper.UintToPtr(1), + DecoderUtilization: helper.UintToPtr(1), + TemperatureC: helper.UintToPtr(1), + UsedMemoryMiB: helper.Uint64ToPtr(1), + BAR1UsedMiB: helper.Uint64ToPtr(1), + ECCErrorsL1Cache: helper.Uint64ToPtr(100), + ECCErrorsL2Cache: nil, + ECCErrorsDevice: helper.Uint64ToPtr(100), + }, + ExpectedResult: &device.DeviceStats{ + Summary: &device.StatValue{ + Unit: TemperatureUnit, + Desc: TemperatureDesc, + IntNumeratorVal: 1, + }, + Stats: &device.StatObject{ + Attributes: map[string]*device.StatValue{ + PowerUsageAttr: { + Unit: PowerUsageUnit, + Desc: PowerUsageDesc, + IntNumeratorVal: 1, + IntDenominatorVal: 1, + }, + GPUUtilizationAttr: { + Unit: GPUUtilizationUnit, + Desc: GPUUtilizationDesc, + IntNumeratorVal: 1, + }, + MemoryUtilizationAttr: { + Unit: MemoryUtilizationUnit, + Desc: MemoryUtilizationDesc, + IntNumeratorVal: 1, + }, + EncoderUtilizationAttr: { + Unit: EncoderUtilizationUnit, + Desc: EncoderUtilizationDesc, + IntNumeratorVal: 1, + }, + DecoderUtilizationAttr: { + Unit: DecoderUtilizationUnit, + Desc: DecoderUtilizationDesc, + IntNumeratorVal: 1, + }, + TemperatureAttr: { + Unit: TemperatureUnit, + Desc: TemperatureDesc, + IntNumeratorVal: 1, + }, + MemoryStateAttr: { + Unit: MemoryStateUnit, + Desc: MemoryStateDesc, + IntNumeratorVal: 1, + IntDenominatorVal: 1, + }, + BAR1StateAttr: { + Unit: BAR1StateUnit, + Desc: BAR1StateDesc, + IntNumeratorVal: 1, + IntDenominatorVal: 256, + }, + 
ECCErrorsL1CacheAttr: { + Unit: ECCErrorsL1CacheUnit, + Desc: ECCErrorsL1CacheDesc, + IntNumeratorVal: 100, + }, + ECCErrorsL2CacheAttr: { + Unit: ECCErrorsL2CacheUnit, + Desc: ECCErrorsL2CacheDesc, + StringVal: notAvailable, + }, + ECCErrorsDeviceAttr: { + Unit: ECCErrorsDeviceUnit, + Desc: ECCErrorsDeviceDesc, + IntNumeratorVal: 100, + }, + }, + }, + Timestamp: time.Date(1974, time.May, 19, 1, 2, 3, 4, time.UTC), + }, + }, + { + Name: "ECCErrorsDevice is nil", + Timestamp: time.Date(1974, time.May, 19, 1, 2, 3, 4, time.UTC), + ItemStat: &nvml.StatsData{ + DeviceData: &nvml.DeviceData{ + UUID: "UUID1", + DeviceName: helper.StringToPtr("DeviceName1"), + MemoryMiB: helper.Uint64ToPtr(1), + PowerW: helper.UintToPtr(1), + BAR1MiB: helper.Uint64ToPtr(256), + }, + PowerUsageW: helper.UintToPtr(1), + GPUUtilization: helper.UintToPtr(1), + MemoryUtilization: helper.UintToPtr(1), + EncoderUtilization: helper.UintToPtr(1), + DecoderUtilization: helper.UintToPtr(1), + TemperatureC: helper.UintToPtr(1), + UsedMemoryMiB: helper.Uint64ToPtr(1), + BAR1UsedMiB: helper.Uint64ToPtr(1), + ECCErrorsL1Cache: helper.Uint64ToPtr(100), + ECCErrorsL2Cache: helper.Uint64ToPtr(100), + ECCErrorsDevice: nil, + }, + ExpectedResult: &device.DeviceStats{ + Summary: &device.StatValue{ + Unit: TemperatureUnit, + Desc: TemperatureDesc, + IntNumeratorVal: 1, + }, + Stats: &device.StatObject{ + Attributes: map[string]*device.StatValue{ + PowerUsageAttr: { + Unit: PowerUsageUnit, + Desc: PowerUsageDesc, + IntNumeratorVal: 1, + IntDenominatorVal: 1, + }, + GPUUtilizationAttr: { + Unit: GPUUtilizationUnit, + Desc: GPUUtilizationDesc, + IntNumeratorVal: 1, + }, + MemoryUtilizationAttr: { + Unit: MemoryUtilizationUnit, + Desc: MemoryUtilizationDesc, + IntNumeratorVal: 1, + }, + EncoderUtilizationAttr: { + Unit: EncoderUtilizationUnit, + Desc: EncoderUtilizationDesc, + IntNumeratorVal: 1, + }, + DecoderUtilizationAttr: { + Unit: DecoderUtilizationUnit, + Desc: DecoderUtilizationDesc, + IntNumeratorVal: 1, 
+ }, + TemperatureAttr: { + Unit: TemperatureUnit, + Desc: TemperatureDesc, + IntNumeratorVal: 1, + }, + MemoryStateAttr: { + Unit: MemoryStateUnit, + Desc: MemoryStateDesc, + IntNumeratorVal: 1, + IntDenominatorVal: 1, + }, + BAR1StateAttr: { + Unit: BAR1StateUnit, + Desc: BAR1StateDesc, + IntNumeratorVal: 1, + IntDenominatorVal: 256, + }, + ECCErrorsL1CacheAttr: { + Unit: ECCErrorsL1CacheUnit, + Desc: ECCErrorsL1CacheDesc, + IntNumeratorVal: 100, + }, + ECCErrorsL2CacheAttr: { + Unit: ECCErrorsL2CacheUnit, + Desc: ECCErrorsL2CacheDesc, + IntNumeratorVal: 100, + }, + ECCErrorsDeviceAttr: { + Unit: ECCErrorsDeviceUnit, + Desc: ECCErrorsDeviceDesc, + StringVal: notAvailable, + }, + }, + }, + Timestamp: time.Date(1974, time.May, 19, 1, 2, 3, 4, time.UTC), + }, + }, + } { + actualResult := statsForItem(testCase.ItemStat, testCase.Timestamp) + require.New(t).Equal(testCase.ExpectedResult, actualResult) + } +} + +func TestStatsForGroup(t *testing.T) { + for _, testCase := range []struct { + Name string + Timestamp time.Time + GroupStats []*nvml.StatsData + GroupName string + ExpectedResult *device.DeviceGroupStats + }{ + { + Name: "make sure that all data is transformed correctly", + Timestamp: time.Date(1974, time.May, 19, 1, 2, 3, 4, time.UTC), + GroupName: "DeviceName1", + GroupStats: []*nvml.StatsData{ + { + DeviceData: &nvml.DeviceData{ + UUID: "UUID1", + DeviceName: helper.StringToPtr("DeviceName1"), + MemoryMiB: helper.Uint64ToPtr(1), + PowerW: helper.UintToPtr(1), + BAR1MiB: helper.Uint64ToPtr(256), + }, + PowerUsageW: helper.UintToPtr(1), + GPUUtilization: helper.UintToPtr(1), + MemoryUtilization: helper.UintToPtr(1), + EncoderUtilization: helper.UintToPtr(1), + DecoderUtilization: helper.UintToPtr(1), + TemperatureC: helper.UintToPtr(1), + UsedMemoryMiB: helper.Uint64ToPtr(1), + BAR1UsedMiB: helper.Uint64ToPtr(1), + ECCErrorsL1Cache: helper.Uint64ToPtr(100), + ECCErrorsL2Cache: helper.Uint64ToPtr(100), + ECCErrorsDevice: helper.Uint64ToPtr(100), + }, + { + 
DeviceData: &nvml.DeviceData{ + UUID: "UUID2", + DeviceName: helper.StringToPtr("DeviceName2"), + MemoryMiB: helper.Uint64ToPtr(2), + PowerW: helper.UintToPtr(2), + BAR1MiB: helper.Uint64ToPtr(256), + }, + PowerUsageW: helper.UintToPtr(2), + GPUUtilization: helper.UintToPtr(2), + MemoryUtilization: helper.UintToPtr(2), + EncoderUtilization: helper.UintToPtr(2), + DecoderUtilization: helper.UintToPtr(2), + TemperatureC: helper.UintToPtr(2), + UsedMemoryMiB: helper.Uint64ToPtr(2), + BAR1UsedMiB: helper.Uint64ToPtr(2), + ECCErrorsL1Cache: helper.Uint64ToPtr(200), + ECCErrorsL2Cache: helper.Uint64ToPtr(200), + ECCErrorsDevice: helper.Uint64ToPtr(200), + }, + { + DeviceData: &nvml.DeviceData{ + UUID: "UUID3", + DeviceName: helper.StringToPtr("DeviceName3"), + MemoryMiB: helper.Uint64ToPtr(3), + PowerW: helper.UintToPtr(3), + BAR1MiB: helper.Uint64ToPtr(256), + }, + PowerUsageW: helper.UintToPtr(3), + GPUUtilization: helper.UintToPtr(3), + MemoryUtilization: helper.UintToPtr(3), + EncoderUtilization: helper.UintToPtr(3), + DecoderUtilization: helper.UintToPtr(3), + TemperatureC: helper.UintToPtr(3), + UsedMemoryMiB: helper.Uint64ToPtr(3), + BAR1UsedMiB: helper.Uint64ToPtr(3), + ECCErrorsL1Cache: helper.Uint64ToPtr(300), + ECCErrorsL2Cache: helper.Uint64ToPtr(300), + ECCErrorsDevice: helper.Uint64ToPtr(300), + }, + }, + ExpectedResult: &device.DeviceGroupStats{ + Vendor: vendor, + Type: deviceType, + Name: "DeviceName1", + InstanceStats: map[string]*device.DeviceStats{ + "UUID1": { + Summary: &device.StatValue{ + Unit: TemperatureUnit, + Desc: TemperatureDesc, + IntNumeratorVal: 1, + }, + Stats: &device.StatObject{ + Attributes: map[string]*device.StatValue{ + PowerUsageAttr: { + Unit: PowerUsageUnit, + Desc: PowerUsageDesc, + IntNumeratorVal: 1, + IntDenominatorVal: 1, + }, + GPUUtilizationAttr: { + Unit: GPUUtilizationUnit, + Desc: GPUUtilizationDesc, + IntNumeratorVal: 1, + }, + MemoryUtilizationAttr: { + Unit: MemoryUtilizationUnit, + Desc: MemoryUtilizationDesc, + 
IntNumeratorVal: 1, + }, + EncoderUtilizationAttr: { + Unit: EncoderUtilizationUnit, + Desc: EncoderUtilizationDesc, + IntNumeratorVal: 1, + }, + DecoderUtilizationAttr: { + Unit: DecoderUtilizationUnit, + Desc: DecoderUtilizationDesc, + IntNumeratorVal: 1, + }, + TemperatureAttr: { + Unit: TemperatureUnit, + Desc: TemperatureDesc, + IntNumeratorVal: 1, + }, + MemoryStateAttr: { + Unit: MemoryStateUnit, + Desc: MemoryStateDesc, + IntNumeratorVal: 1, + IntDenominatorVal: 1, + }, + BAR1StateAttr: { + Unit: BAR1StateUnit, + Desc: BAR1StateDesc, + IntNumeratorVal: 1, + IntDenominatorVal: 256, + }, + ECCErrorsL1CacheAttr: { + Unit: ECCErrorsL1CacheUnit, + Desc: ECCErrorsL1CacheDesc, + IntNumeratorVal: 100, + }, + ECCErrorsL2CacheAttr: { + Unit: ECCErrorsL2CacheUnit, + Desc: ECCErrorsL2CacheDesc, + IntNumeratorVal: 100, + }, + ECCErrorsDeviceAttr: { + Unit: ECCErrorsDeviceUnit, + Desc: ECCErrorsDeviceDesc, + IntNumeratorVal: 100, + }, + }, + }, + Timestamp: time.Date(1974, time.May, 19, 1, 2, 3, 4, time.UTC), + }, + "UUID2": { + Summary: &device.StatValue{ + Unit: TemperatureUnit, + Desc: TemperatureDesc, + IntNumeratorVal: 2, + }, + Stats: &device.StatObject{ + Attributes: map[string]*device.StatValue{ + PowerUsageAttr: { + Unit: PowerUsageUnit, + Desc: PowerUsageDesc, + IntNumeratorVal: 2, + IntDenominatorVal: 2, + }, + GPUUtilizationAttr: { + Unit: GPUUtilizationUnit, + Desc: GPUUtilizationDesc, + IntNumeratorVal: 2, + }, + MemoryUtilizationAttr: { + Unit: MemoryUtilizationUnit, + Desc: MemoryUtilizationDesc, + IntNumeratorVal: 2, + }, + EncoderUtilizationAttr: { + Unit: EncoderUtilizationUnit, + Desc: EncoderUtilizationDesc, + IntNumeratorVal: 2, + }, + DecoderUtilizationAttr: { + Unit: DecoderUtilizationUnit, + Desc: DecoderUtilizationDesc, + IntNumeratorVal: 2, + }, + TemperatureAttr: { + Unit: TemperatureUnit, + Desc: TemperatureDesc, + IntNumeratorVal: 2, + }, + MemoryStateAttr: { + Unit: MemoryStateUnit, + Desc: MemoryStateDesc, + IntNumeratorVal: 2, + 
IntDenominatorVal: 2, + }, + BAR1StateAttr: { + Unit: BAR1StateUnit, + Desc: BAR1StateDesc, + IntNumeratorVal: 2, + IntDenominatorVal: 256, + }, + ECCErrorsL1CacheAttr: { + Unit: ECCErrorsL1CacheUnit, + Desc: ECCErrorsL1CacheDesc, + IntNumeratorVal: 200, + }, + ECCErrorsL2CacheAttr: { + Unit: ECCErrorsL2CacheUnit, + Desc: ECCErrorsL2CacheDesc, + IntNumeratorVal: 200, + }, + ECCErrorsDeviceAttr: { + Unit: ECCErrorsDeviceUnit, + Desc: ECCErrorsDeviceDesc, + IntNumeratorVal: 200, + }, + }, + }, + Timestamp: time.Date(1974, time.May, 19, 1, 2, 3, 4, time.UTC), + }, + "UUID3": { + Summary: &device.StatValue{ + Unit: TemperatureUnit, + Desc: TemperatureDesc, + IntNumeratorVal: 3, + }, + Stats: &device.StatObject{ + Attributes: map[string]*device.StatValue{ + PowerUsageAttr: { + Unit: PowerUsageUnit, + Desc: PowerUsageDesc, + IntNumeratorVal: 3, + IntDenominatorVal: 3, + }, + GPUUtilizationAttr: { + Unit: GPUUtilizationUnit, + Desc: GPUUtilizationDesc, + IntNumeratorVal: 3, + }, + MemoryUtilizationAttr: { + Unit: MemoryUtilizationUnit, + Desc: MemoryUtilizationDesc, + IntNumeratorVal: 3, + }, + EncoderUtilizationAttr: { + Unit: EncoderUtilizationUnit, + Desc: EncoderUtilizationDesc, + IntNumeratorVal: 3, + }, + DecoderUtilizationAttr: { + Unit: DecoderUtilizationUnit, + Desc: DecoderUtilizationDesc, + IntNumeratorVal: 3, + }, + TemperatureAttr: { + Unit: TemperatureUnit, + Desc: TemperatureDesc, + IntNumeratorVal: 3, + }, + MemoryStateAttr: { + Unit: MemoryStateUnit, + Desc: MemoryStateDesc, + IntNumeratorVal: 3, + IntDenominatorVal: 3, + }, + BAR1StateAttr: { + Unit: BAR1StateUnit, + Desc: BAR1StateDesc, + IntNumeratorVal: 3, + IntDenominatorVal: 256, + }, + ECCErrorsL1CacheAttr: { + Unit: ECCErrorsL1CacheUnit, + Desc: ECCErrorsL1CacheDesc, + IntNumeratorVal: 300, + }, + ECCErrorsL2CacheAttr: { + Unit: ECCErrorsL2CacheUnit, + Desc: ECCErrorsL2CacheDesc, + IntNumeratorVal: 300, + }, + ECCErrorsDeviceAttr: { + Unit: ECCErrorsDeviceUnit, + Desc: ECCErrorsDeviceDesc, + 
IntNumeratorVal: 300, + }, + }, + }, + Timestamp: time.Date(1974, time.May, 19, 1, 2, 3, 4, time.UTC), + }, + }, + }, + }, + } { + actualResult := statsForGroup(testCase.GroupName, testCase.GroupStats, testCase.Timestamp) + require.New(t).Equal(testCase.ExpectedResult, actualResult) + } +} + +func TestWriteStatsToChannel(t *testing.T) { + for _, testCase := range []struct { + Name string + ExpectedWriteToChannel *device.StatsResponse + Timestamp time.Time + Device *NvidiaDevice + }{ + { + Name: "NVML wrapper returns error", + Timestamp: time.Date(1974, time.May, 19, 1, 2, 3, 4, time.UTC), + ExpectedWriteToChannel: &device.StatsResponse{ + Error: errors.New(""), + }, + Device: &NvidiaDevice{ + nvmlClient: &MockNvmlClient{ + StatsError: errors.New(""), + }, + logger: hclog.NewNullLogger(), + }, + }, + { + Name: "Check that stats with multiple DeviceNames are assigned to different groups", + Timestamp: time.Date(1974, time.May, 19, 1, 2, 3, 4, time.UTC), + Device: &NvidiaDevice{ + devices: map[string]struct{}{ + "UUID1": {}, + "UUID2": {}, + "UUID3": {}, + }, + nvmlClient: &MockNvmlClient{ + StatsResponseReturned: []*nvml.StatsData{ + { + DeviceData: &nvml.DeviceData{ + UUID: "UUID1", + DeviceName: helper.StringToPtr("DeviceName1"), + MemoryMiB: helper.Uint64ToPtr(1), + PowerW: helper.UintToPtr(1), + BAR1MiB: helper.Uint64ToPtr(256), + }, + PowerUsageW: helper.UintToPtr(1), + GPUUtilization: helper.UintToPtr(1), + MemoryUtilization: helper.UintToPtr(1), + EncoderUtilization: helper.UintToPtr(1), + DecoderUtilization: helper.UintToPtr(1), + TemperatureC: helper.UintToPtr(1), + UsedMemoryMiB: helper.Uint64ToPtr(1), + BAR1UsedMiB: helper.Uint64ToPtr(1), + ECCErrorsL1Cache: helper.Uint64ToPtr(100), + ECCErrorsL2Cache: helper.Uint64ToPtr(100), + ECCErrorsDevice: helper.Uint64ToPtr(100), + }, + { + DeviceData: &nvml.DeviceData{ + UUID: "UUID2", + DeviceName: helper.StringToPtr("DeviceName2"), + MemoryMiB: helper.Uint64ToPtr(2), + PowerW: helper.UintToPtr(2), + BAR1MiB: 
helper.Uint64ToPtr(256), + }, + PowerUsageW: helper.UintToPtr(2), + GPUUtilization: helper.UintToPtr(2), + MemoryUtilization: helper.UintToPtr(2), + EncoderUtilization: helper.UintToPtr(2), + DecoderUtilization: helper.UintToPtr(2), + TemperatureC: helper.UintToPtr(2), + UsedMemoryMiB: helper.Uint64ToPtr(2), + BAR1UsedMiB: helper.Uint64ToPtr(2), + ECCErrorsL1Cache: helper.Uint64ToPtr(200), + ECCErrorsL2Cache: helper.Uint64ToPtr(200), + ECCErrorsDevice: helper.Uint64ToPtr(200), + }, + { + DeviceData: &nvml.DeviceData{ + UUID: "UUID3", + DeviceName: helper.StringToPtr("DeviceName3"), + MemoryMiB: helper.Uint64ToPtr(3), + PowerW: helper.UintToPtr(3), + BAR1MiB: helper.Uint64ToPtr(256), + }, + PowerUsageW: helper.UintToPtr(3), + GPUUtilization: helper.UintToPtr(3), + MemoryUtilization: helper.UintToPtr(3), + EncoderUtilization: helper.UintToPtr(3), + DecoderUtilization: helper.UintToPtr(3), + TemperatureC: helper.UintToPtr(3), + UsedMemoryMiB: helper.Uint64ToPtr(3), + BAR1UsedMiB: helper.Uint64ToPtr(3), + ECCErrorsL1Cache: helper.Uint64ToPtr(300), + ECCErrorsL2Cache: helper.Uint64ToPtr(300), + ECCErrorsDevice: helper.Uint64ToPtr(300), + }, + }, + }, + logger: hclog.NewNullLogger(), + }, + ExpectedWriteToChannel: &device.StatsResponse{ + Groups: []*device.DeviceGroupStats{ + { + Vendor: vendor, + Type: deviceType, + Name: "DeviceName1", + InstanceStats: map[string]*device.DeviceStats{ + "UUID1": { + Summary: &device.StatValue{ + Unit: TemperatureUnit, + Desc: TemperatureDesc, + IntNumeratorVal: 1, + }, + Stats: &device.StatObject{ + Attributes: map[string]*device.StatValue{ + PowerUsageAttr: { + Unit: PowerUsageUnit, + Desc: PowerUsageDesc, + IntNumeratorVal: 1, + IntDenominatorVal: 1, + }, + GPUUtilizationAttr: { + Unit: GPUUtilizationUnit, + Desc: GPUUtilizationDesc, + IntNumeratorVal: 1, + }, + MemoryUtilizationAttr: { + Unit: MemoryUtilizationUnit, + Desc: MemoryUtilizationDesc, + IntNumeratorVal: 1, + }, + EncoderUtilizationAttr: { + Unit: EncoderUtilizationUnit, + 
Desc: EncoderUtilizationDesc, + IntNumeratorVal: 1, + }, + DecoderUtilizationAttr: { + Unit: DecoderUtilizationUnit, + Desc: DecoderUtilizationDesc, + IntNumeratorVal: 1, + }, + TemperatureAttr: { + Unit: TemperatureUnit, + Desc: TemperatureDesc, + IntNumeratorVal: 1, + }, + MemoryStateAttr: { + Unit: MemoryStateUnit, + Desc: MemoryStateDesc, + IntNumeratorVal: 1, + IntDenominatorVal: 1, + }, + BAR1StateAttr: { + Unit: BAR1StateUnit, + Desc: BAR1StateDesc, + IntNumeratorVal: 1, + IntDenominatorVal: 256, + }, + ECCErrorsL1CacheAttr: { + Unit: ECCErrorsL1CacheUnit, + Desc: ECCErrorsL1CacheDesc, + IntNumeratorVal: 100, + }, + ECCErrorsL2CacheAttr: { + Unit: ECCErrorsL2CacheUnit, + Desc: ECCErrorsL2CacheDesc, + IntNumeratorVal: 100, + }, + ECCErrorsDeviceAttr: { + Unit: ECCErrorsDeviceUnit, + Desc: ECCErrorsDeviceDesc, + IntNumeratorVal: 100, + }, + }, + }, + Timestamp: time.Date(1974, time.May, 19, 1, 2, 3, 4, time.UTC), + }, + }, + }, + { + Vendor: vendor, + Type: deviceType, + Name: "DeviceName2", + InstanceStats: map[string]*device.DeviceStats{ + "UUID2": { + Summary: &device.StatValue{ + Unit: TemperatureUnit, + Desc: TemperatureDesc, + IntNumeratorVal: 2, + }, + Stats: &device.StatObject{ + Attributes: map[string]*device.StatValue{ + PowerUsageAttr: { + Unit: PowerUsageUnit, + Desc: PowerUsageDesc, + IntNumeratorVal: 2, + IntDenominatorVal: 2, + }, + GPUUtilizationAttr: { + Unit: GPUUtilizationUnit, + Desc: GPUUtilizationDesc, + IntNumeratorVal: 2, + }, + MemoryUtilizationAttr: { + Unit: MemoryUtilizationUnit, + Desc: MemoryUtilizationDesc, + IntNumeratorVal: 2, + }, + EncoderUtilizationAttr: { + Unit: EncoderUtilizationUnit, + Desc: EncoderUtilizationDesc, + IntNumeratorVal: 2, + }, + DecoderUtilizationAttr: { + Unit: DecoderUtilizationUnit, + Desc: DecoderUtilizationDesc, + IntNumeratorVal: 2, + }, + TemperatureAttr: { + Unit: TemperatureUnit, + Desc: TemperatureDesc, + IntNumeratorVal: 2, + }, + MemoryStateAttr: { + Unit: MemoryStateUnit, + Desc: 
MemoryStateDesc, + IntNumeratorVal: 2, + IntDenominatorVal: 2, + }, + BAR1StateAttr: { + Unit: BAR1StateUnit, + Desc: BAR1StateDesc, + IntNumeratorVal: 2, + IntDenominatorVal: 256, + }, + ECCErrorsL1CacheAttr: { + Unit: ECCErrorsL1CacheUnit, + Desc: ECCErrorsL1CacheDesc, + IntNumeratorVal: 200, + }, + ECCErrorsL2CacheAttr: { + Unit: ECCErrorsL2CacheUnit, + Desc: ECCErrorsL2CacheDesc, + IntNumeratorVal: 200, + }, + ECCErrorsDeviceAttr: { + Unit: ECCErrorsDeviceUnit, + Desc: ECCErrorsDeviceDesc, + IntNumeratorVal: 200, + }, + }, + }, + Timestamp: time.Date(1974, time.May, 19, 1, 2, 3, 4, time.UTC), + }, + }, + }, + { + Vendor: vendor, + Type: deviceType, + Name: "DeviceName3", + InstanceStats: map[string]*device.DeviceStats{ + "UUID3": { + Summary: &device.StatValue{ + Unit: TemperatureUnit, + Desc: TemperatureDesc, + IntNumeratorVal: 3, + }, + Stats: &device.StatObject{ + Attributes: map[string]*device.StatValue{ + PowerUsageAttr: { + Unit: PowerUsageUnit, + Desc: PowerUsageDesc, + IntNumeratorVal: 3, + IntDenominatorVal: 3, + }, + GPUUtilizationAttr: { + Unit: GPUUtilizationUnit, + Desc: GPUUtilizationDesc, + IntNumeratorVal: 3, + }, + MemoryUtilizationAttr: { + Unit: MemoryUtilizationUnit, + Desc: MemoryUtilizationDesc, + IntNumeratorVal: 3, + }, + EncoderUtilizationAttr: { + Unit: EncoderUtilizationUnit, + Desc: EncoderUtilizationDesc, + IntNumeratorVal: 3, + }, + DecoderUtilizationAttr: { + Unit: DecoderUtilizationUnit, + Desc: DecoderUtilizationDesc, + IntNumeratorVal: 3, + }, + TemperatureAttr: { + Unit: TemperatureUnit, + Desc: TemperatureDesc, + IntNumeratorVal: 3, + }, + MemoryStateAttr: { + Unit: MemoryStateUnit, + Desc: MemoryStateDesc, + IntNumeratorVal: 3, + IntDenominatorVal: 3, + }, + BAR1StateAttr: { + Unit: BAR1StateUnit, + Desc: BAR1StateDesc, + IntNumeratorVal: 3, + IntDenominatorVal: 256, + }, + ECCErrorsL1CacheAttr: { + Unit: ECCErrorsL1CacheUnit, + Desc: ECCErrorsL1CacheDesc, + IntNumeratorVal: 300, + }, + ECCErrorsL2CacheAttr: { + Unit: 
ECCErrorsL2CacheUnit, + Desc: ECCErrorsL2CacheDesc, + IntNumeratorVal: 300, + }, + ECCErrorsDeviceAttr: { + Unit: ECCErrorsDeviceUnit, + Desc: ECCErrorsDeviceDesc, + IntNumeratorVal: 300, + }, + }, + }, + Timestamp: time.Date(1974, time.May, 19, 1, 2, 3, 4, time.UTC), + }, + }, + }, + }, + }, + }, + { + Name: "Check that stats with multiple DeviceNames are assigned to different groups 2", + Timestamp: time.Date(1974, time.May, 19, 1, 2, 3, 4, time.UTC), + Device: &NvidiaDevice{ + devices: map[string]struct{}{ + "UUID1": {}, + "UUID2": {}, + "UUID3": {}, + }, + nvmlClient: &MockNvmlClient{ + StatsResponseReturned: []*nvml.StatsData{ + { + DeviceData: &nvml.DeviceData{ + UUID: "UUID1", + DeviceName: helper.StringToPtr("DeviceName1"), + MemoryMiB: helper.Uint64ToPtr(1), + PowerW: helper.UintToPtr(1), + BAR1MiB: helper.Uint64ToPtr(256), + }, + PowerUsageW: helper.UintToPtr(1), + GPUUtilization: helper.UintToPtr(1), + MemoryUtilization: helper.UintToPtr(1), + EncoderUtilization: helper.UintToPtr(1), + DecoderUtilization: helper.UintToPtr(1), + TemperatureC: helper.UintToPtr(1), + UsedMemoryMiB: helper.Uint64ToPtr(1), + BAR1UsedMiB: helper.Uint64ToPtr(1), + ECCErrorsL1Cache: helper.Uint64ToPtr(100), + ECCErrorsL2Cache: helper.Uint64ToPtr(100), + ECCErrorsDevice: helper.Uint64ToPtr(100), + }, + { + DeviceData: &nvml.DeviceData{ + UUID: "UUID2", + DeviceName: helper.StringToPtr("DeviceName2"), + MemoryMiB: helper.Uint64ToPtr(2), + PowerW: helper.UintToPtr(2), + BAR1MiB: helper.Uint64ToPtr(256), + }, + PowerUsageW: helper.UintToPtr(2), + GPUUtilization: helper.UintToPtr(2), + MemoryUtilization: helper.UintToPtr(2), + EncoderUtilization: helper.UintToPtr(2), + DecoderUtilization: helper.UintToPtr(2), + TemperatureC: helper.UintToPtr(2), + UsedMemoryMiB: helper.Uint64ToPtr(2), + BAR1UsedMiB: helper.Uint64ToPtr(2), + ECCErrorsL1Cache: helper.Uint64ToPtr(200), + ECCErrorsL2Cache: helper.Uint64ToPtr(200), + ECCErrorsDevice: helper.Uint64ToPtr(200), + }, + { + DeviceData: 
&nvml.DeviceData{ + UUID: "UUID3", + DeviceName: helper.StringToPtr("DeviceName2"), + MemoryMiB: helper.Uint64ToPtr(3), + PowerW: helper.UintToPtr(3), + BAR1MiB: helper.Uint64ToPtr(256), + }, + PowerUsageW: helper.UintToPtr(3), + GPUUtilization: helper.UintToPtr(3), + MemoryUtilization: helper.UintToPtr(3), + EncoderUtilization: helper.UintToPtr(3), + DecoderUtilization: helper.UintToPtr(3), + TemperatureC: helper.UintToPtr(3), + UsedMemoryMiB: helper.Uint64ToPtr(3), + BAR1UsedMiB: helper.Uint64ToPtr(3), + ECCErrorsL1Cache: helper.Uint64ToPtr(300), + ECCErrorsL2Cache: helper.Uint64ToPtr(300), + ECCErrorsDevice: helper.Uint64ToPtr(300), + }, + }, + }, + logger: hclog.NewNullLogger(), + }, + ExpectedWriteToChannel: &device.StatsResponse{ + Groups: []*device.DeviceGroupStats{ + { + Vendor: vendor, + Type: deviceType, + Name: "DeviceName1", + InstanceStats: map[string]*device.DeviceStats{ + "UUID1": { + Summary: &device.StatValue{ + Unit: TemperatureUnit, + Desc: TemperatureDesc, + IntNumeratorVal: 1, + }, + Stats: &device.StatObject{ + Attributes: map[string]*device.StatValue{ + PowerUsageAttr: { + Unit: PowerUsageUnit, + Desc: PowerUsageDesc, + IntNumeratorVal: 1, + IntDenominatorVal: 1, + }, + GPUUtilizationAttr: { + Unit: GPUUtilizationUnit, + Desc: GPUUtilizationDesc, + IntNumeratorVal: 1, + }, + MemoryUtilizationAttr: { + Unit: MemoryUtilizationUnit, + Desc: MemoryUtilizationDesc, + IntNumeratorVal: 1, + }, + EncoderUtilizationAttr: { + Unit: EncoderUtilizationUnit, + Desc: EncoderUtilizationDesc, + IntNumeratorVal: 1, + }, + DecoderUtilizationAttr: { + Unit: DecoderUtilizationUnit, + Desc: DecoderUtilizationDesc, + IntNumeratorVal: 1, + }, + TemperatureAttr: { + Unit: TemperatureUnit, + Desc: TemperatureDesc, + IntNumeratorVal: 1, + }, + MemoryStateAttr: { + Unit: MemoryStateUnit, + Desc: MemoryStateDesc, + IntNumeratorVal: 1, + IntDenominatorVal: 1, + }, + BAR1StateAttr: { + Unit: BAR1StateUnit, + Desc: BAR1StateDesc, + IntNumeratorVal: 1, + IntDenominatorVal: 
256, + }, + ECCErrorsL1CacheAttr: { + Unit: ECCErrorsL1CacheUnit, + Desc: ECCErrorsL1CacheDesc, + IntNumeratorVal: 100, + }, + ECCErrorsL2CacheAttr: { + Unit: ECCErrorsL2CacheUnit, + Desc: ECCErrorsL2CacheDesc, + IntNumeratorVal: 100, + }, + ECCErrorsDeviceAttr: { + Unit: ECCErrorsDeviceUnit, + Desc: ECCErrorsDeviceDesc, + IntNumeratorVal: 100, + }, + }, + }, + Timestamp: time.Date(1974, time.May, 19, 1, 2, 3, 4, time.UTC), + }, + }, + }, + { + Vendor: vendor, + Type: deviceType, + Name: "DeviceName2", + InstanceStats: map[string]*device.DeviceStats{ + "UUID3": { + Summary: &device.StatValue{ + Unit: TemperatureUnit, + Desc: TemperatureDesc, + IntNumeratorVal: 3, + }, + Stats: &device.StatObject{ + Attributes: map[string]*device.StatValue{ + PowerUsageAttr: { + Unit: PowerUsageUnit, + Desc: PowerUsageDesc, + IntNumeratorVal: 3, + IntDenominatorVal: 3, + }, + GPUUtilizationAttr: { + Unit: GPUUtilizationUnit, + Desc: GPUUtilizationDesc, + IntNumeratorVal: 3, + }, + MemoryUtilizationAttr: { + Unit: MemoryUtilizationUnit, + Desc: MemoryUtilizationDesc, + IntNumeratorVal: 3, + }, + EncoderUtilizationAttr: { + Unit: EncoderUtilizationUnit, + Desc: EncoderUtilizationDesc, + IntNumeratorVal: 3, + }, + DecoderUtilizationAttr: { + Unit: DecoderUtilizationUnit, + Desc: DecoderUtilizationDesc, + IntNumeratorVal: 3, + }, + TemperatureAttr: { + Unit: TemperatureUnit, + Desc: TemperatureDesc, + IntNumeratorVal: 3, + }, + MemoryStateAttr: { + Unit: MemoryStateUnit, + Desc: MemoryStateDesc, + IntNumeratorVal: 3, + IntDenominatorVal: 3, + }, + BAR1StateAttr: { + Unit: BAR1StateUnit, + Desc: BAR1StateDesc, + IntNumeratorVal: 3, + IntDenominatorVal: 256, + }, + ECCErrorsL1CacheAttr: { + Unit: ECCErrorsL1CacheUnit, + Desc: ECCErrorsL1CacheDesc, + IntNumeratorVal: 300, + }, + ECCErrorsL2CacheAttr: { + Unit: ECCErrorsL2CacheUnit, + Desc: ECCErrorsL2CacheDesc, + IntNumeratorVal: 300, + }, + ECCErrorsDeviceAttr: { + Unit: ECCErrorsDeviceUnit, + Desc: ECCErrorsDeviceDesc, + IntNumeratorVal: 
300, + }, + }, + }, + Timestamp: time.Date(1974, time.May, 19, 1, 2, 3, 4, time.UTC), + }, + "UUID2": { + Summary: &device.StatValue{ + Unit: TemperatureUnit, + Desc: TemperatureDesc, + IntNumeratorVal: 2, + }, + Stats: &device.StatObject{ + Attributes: map[string]*device.StatValue{ + PowerUsageAttr: { + Unit: PowerUsageUnit, + Desc: PowerUsageDesc, + IntNumeratorVal: 2, + IntDenominatorVal: 2, + }, + GPUUtilizationAttr: { + Unit: GPUUtilizationUnit, + Desc: GPUUtilizationDesc, + IntNumeratorVal: 2, + }, + MemoryUtilizationAttr: { + Unit: MemoryUtilizationUnit, + Desc: MemoryUtilizationDesc, + IntNumeratorVal: 2, + }, + EncoderUtilizationAttr: { + Unit: EncoderUtilizationUnit, + Desc: EncoderUtilizationDesc, + IntNumeratorVal: 2, + }, + DecoderUtilizationAttr: { + Unit: DecoderUtilizationUnit, + Desc: DecoderUtilizationDesc, + IntNumeratorVal: 2, + }, + TemperatureAttr: { + Unit: TemperatureUnit, + Desc: TemperatureDesc, + IntNumeratorVal: 2, + }, + MemoryStateAttr: { + Unit: MemoryStateUnit, + Desc: MemoryStateDesc, + IntNumeratorVal: 2, + IntDenominatorVal: 2, + }, + BAR1StateAttr: { + Unit: BAR1StateUnit, + Desc: BAR1StateDesc, + IntNumeratorVal: 2, + IntDenominatorVal: 256, + }, + ECCErrorsL1CacheAttr: { + Unit: ECCErrorsL1CacheUnit, + Desc: ECCErrorsL1CacheDesc, + IntNumeratorVal: 200, + }, + ECCErrorsL2CacheAttr: { + Unit: ECCErrorsL2CacheUnit, + Desc: ECCErrorsL2CacheDesc, + IntNumeratorVal: 200, + }, + ECCErrorsDeviceAttr: { + Unit: ECCErrorsDeviceUnit, + Desc: ECCErrorsDeviceDesc, + IntNumeratorVal: 200, + }, + }, + }, + Timestamp: time.Date(1974, time.May, 19, 1, 2, 3, 4, time.UTC), + }, + }, + }, + }, + }, + }, + { + Name: "Check that only devices from NvidiaDevice.device map stats are reported", + Timestamp: time.Date(1974, time.May, 19, 1, 2, 3, 4, time.UTC), + Device: &NvidiaDevice{ + devices: map[string]struct{}{ + "UUID1": {}, + "UUID2": {}, + }, + nvmlClient: &MockNvmlClient{ + StatsResponseReturned: []*nvml.StatsData{ + { + DeviceData: 
&nvml.DeviceData{ + UUID: "UUID1", + DeviceName: helper.StringToPtr("DeviceName1"), + MemoryMiB: helper.Uint64ToPtr(1), + PowerW: helper.UintToPtr(1), + BAR1MiB: helper.Uint64ToPtr(256), + }, + PowerUsageW: helper.UintToPtr(1), + GPUUtilization: helper.UintToPtr(1), + MemoryUtilization: helper.UintToPtr(1), + EncoderUtilization: helper.UintToPtr(1), + DecoderUtilization: helper.UintToPtr(1), + TemperatureC: helper.UintToPtr(1), + UsedMemoryMiB: helper.Uint64ToPtr(1), + BAR1UsedMiB: helper.Uint64ToPtr(1), + ECCErrorsL1Cache: helper.Uint64ToPtr(100), + ECCErrorsL2Cache: helper.Uint64ToPtr(100), + ECCErrorsDevice: helper.Uint64ToPtr(100), + }, + { + DeviceData: &nvml.DeviceData{ + UUID: "UUID2", + DeviceName: helper.StringToPtr("DeviceName2"), + MemoryMiB: helper.Uint64ToPtr(2), + PowerW: helper.UintToPtr(2), + BAR1MiB: helper.Uint64ToPtr(256), + }, + PowerUsageW: helper.UintToPtr(2), + GPUUtilization: helper.UintToPtr(2), + MemoryUtilization: helper.UintToPtr(2), + EncoderUtilization: helper.UintToPtr(2), + DecoderUtilization: helper.UintToPtr(2), + TemperatureC: helper.UintToPtr(2), + UsedMemoryMiB: helper.Uint64ToPtr(2), + BAR1UsedMiB: helper.Uint64ToPtr(2), + ECCErrorsL1Cache: helper.Uint64ToPtr(200), + ECCErrorsL2Cache: helper.Uint64ToPtr(200), + ECCErrorsDevice: helper.Uint64ToPtr(200), + }, + { + DeviceData: &nvml.DeviceData{ + UUID: "UUID3", + DeviceName: helper.StringToPtr("DeviceName3"), + MemoryMiB: helper.Uint64ToPtr(3), + PowerW: helper.UintToPtr(3), + BAR1MiB: helper.Uint64ToPtr(256), + }, + PowerUsageW: helper.UintToPtr(3), + GPUUtilization: helper.UintToPtr(3), + MemoryUtilization: helper.UintToPtr(3), + EncoderUtilization: helper.UintToPtr(3), + DecoderUtilization: helper.UintToPtr(3), + TemperatureC: helper.UintToPtr(3), + UsedMemoryMiB: helper.Uint64ToPtr(3), + BAR1UsedMiB: helper.Uint64ToPtr(3), + ECCErrorsL1Cache: helper.Uint64ToPtr(300), + ECCErrorsL2Cache: helper.Uint64ToPtr(300), + ECCErrorsDevice: helper.Uint64ToPtr(300), + }, + }, + }, + 
logger: hclog.NewNullLogger(), + }, + ExpectedWriteToChannel: &device.StatsResponse{ + Groups: []*device.DeviceGroupStats{ + { + Vendor: vendor, + Type: deviceType, + Name: "DeviceName1", + InstanceStats: map[string]*device.DeviceStats{ + "UUID1": { + Summary: &device.StatValue{ + Unit: TemperatureUnit, + Desc: TemperatureDesc, + IntNumeratorVal: 1, + }, + Stats: &device.StatObject{ + Attributes: map[string]*device.StatValue{ + PowerUsageAttr: { + Unit: PowerUsageUnit, + Desc: PowerUsageDesc, + IntNumeratorVal: 1, + IntDenominatorVal: 1, + }, + GPUUtilizationAttr: { + Unit: GPUUtilizationUnit, + Desc: GPUUtilizationDesc, + IntNumeratorVal: 1, + }, + MemoryUtilizationAttr: { + Unit: MemoryUtilizationUnit, + Desc: MemoryUtilizationDesc, + IntNumeratorVal: 1, + }, + EncoderUtilizationAttr: { + Unit: EncoderUtilizationUnit, + Desc: EncoderUtilizationDesc, + IntNumeratorVal: 1, + }, + DecoderUtilizationAttr: { + Unit: DecoderUtilizationUnit, + Desc: DecoderUtilizationDesc, + IntNumeratorVal: 1, + }, + TemperatureAttr: { + Unit: TemperatureUnit, + Desc: TemperatureDesc, + IntNumeratorVal: 1, + }, + MemoryStateAttr: { + Unit: MemoryStateUnit, + Desc: MemoryStateDesc, + IntNumeratorVal: 1, + IntDenominatorVal: 1, + }, + BAR1StateAttr: { + Unit: BAR1StateUnit, + Desc: BAR1StateDesc, + IntNumeratorVal: 1, + IntDenominatorVal: 256, + }, + ECCErrorsL1CacheAttr: { + Unit: ECCErrorsL1CacheUnit, + Desc: ECCErrorsL1CacheDesc, + IntNumeratorVal: 100, + }, + ECCErrorsL2CacheAttr: { + Unit: ECCErrorsL2CacheUnit, + Desc: ECCErrorsL2CacheDesc, + IntNumeratorVal: 100, + }, + ECCErrorsDeviceAttr: { + Unit: ECCErrorsDeviceUnit, + Desc: ECCErrorsDeviceDesc, + IntNumeratorVal: 100, + }, + }, + }, + Timestamp: time.Date(1974, time.May, 19, 1, 2, 3, 4, time.UTC), + }, + }, + }, + { + Vendor: vendor, + Type: deviceType, + Name: "DeviceName2", + InstanceStats: map[string]*device.DeviceStats{ + "UUID2": { + Summary: &device.StatValue{ + Unit: TemperatureUnit, + Desc: TemperatureDesc, + 
IntNumeratorVal: 2, + }, + Stats: &device.StatObject{ + Attributes: map[string]*device.StatValue{ + PowerUsageAttr: { + Unit: PowerUsageUnit, + Desc: PowerUsageDesc, + IntNumeratorVal: 2, + IntDenominatorVal: 2, + }, + GPUUtilizationAttr: { + Unit: GPUUtilizationUnit, + Desc: GPUUtilizationDesc, + IntNumeratorVal: 2, + }, + MemoryUtilizationAttr: { + Unit: MemoryUtilizationUnit, + Desc: MemoryUtilizationDesc, + IntNumeratorVal: 2, + }, + EncoderUtilizationAttr: { + Unit: EncoderUtilizationUnit, + Desc: EncoderUtilizationDesc, + IntNumeratorVal: 2, + }, + DecoderUtilizationAttr: { + Unit: DecoderUtilizationUnit, + Desc: DecoderUtilizationDesc, + IntNumeratorVal: 2, + }, + TemperatureAttr: { + Unit: TemperatureUnit, + Desc: TemperatureDesc, + IntNumeratorVal: 2, + }, + MemoryStateAttr: { + Unit: MemoryStateUnit, + Desc: MemoryStateDesc, + IntNumeratorVal: 2, + IntDenominatorVal: 2, + }, + BAR1StateAttr: { + Unit: BAR1StateUnit, + Desc: BAR1StateDesc, + IntNumeratorVal: 2, + IntDenominatorVal: 256, + }, + ECCErrorsL1CacheAttr: { + Unit: ECCErrorsL1CacheUnit, + Desc: ECCErrorsL1CacheDesc, + IntNumeratorVal: 200, + }, + ECCErrorsL2CacheAttr: { + Unit: ECCErrorsL2CacheUnit, + Desc: ECCErrorsL2CacheDesc, + IntNumeratorVal: 200, + }, + ECCErrorsDeviceAttr: { + Unit: ECCErrorsDeviceUnit, + Desc: ECCErrorsDeviceDesc, + IntNumeratorVal: 200, + }, + }, + }, + Timestamp: time.Date(1974, time.May, 19, 1, 2, 3, 4, time.UTC), + }, + }, + }, + }, + }, + }, + } { + channel := make(chan *device.StatsResponse, 1) + testCase.Device.writeStatsToChannel(channel, testCase.Timestamp) + actualResult := <-channel + // writeStatsToChannel iterates over map keys + // and inserts results into an array, so the order of elements in the output array + // may be different + // actualResult, expectedWriteToChannel arrays have to be sorted first + sort.Slice(actualResult.Groups, func(i, j int) bool { + return actualResult.Groups[i].Name < actualResult.Groups[j].Name + }) +
sort.Slice(testCase.ExpectedWriteToChannel.Groups, func(i, j int) bool { + return testCase.ExpectedWriteToChannel.Groups[i].Name < testCase.ExpectedWriteToChannel.Groups[j].Name + }) + require.New(t).Equal(testCase.ExpectedWriteToChannel, actualResult) + } +} diff --git a/vendor/github.com/NVIDIA/gpu-monitoring-tools/CLA b/vendor/github.com/NVIDIA/gpu-monitoring-tools/CLA new file mode 100644 index 000000000..1001ecb5f --- /dev/null +++ b/vendor/github.com/NVIDIA/gpu-monitoring-tools/CLA @@ -0,0 +1,160 @@ + GPU Monitoring Tools + Software Grant and Corporate Contributor License Agreement ("Agreement") + + Thank you for your interest in the gpu-monitoring-tools Project (the + "Project"). In order to clarify the intellectual property license + granted with Contributions from any person or entity, NVIDIA + Corporation (the “Copyright Holders") must have a Contributor License + Agreement (CLA) on file that has been signed by each Contributor, + indicating agreement to the license terms below. This license is + for your protection as a Contributor as well as the protection of the + Project and its users; it does not change your rights to use your own + Contributions for any other purpose. + + This version of the Agreement allows an entity (the "Corporation") to + submit Contributions to the Project, to authorize Contributions + submitted by its designated employees to the Project, and to grant + copyright and patent licenses thereto to the Copyright Holders. + + If you have not already done so, please complete and sign, then scan and + email a pdf file of this Agreement to digits@nvidia.com. + Please read this document carefully before signing and keep a copy for + your records. 
+ + Corporation name: ________________________________________________ + + Corporation address: ________________________________________________ + + ________________________________________________ + + ________________________________________________ + + Point of Contact: ________________________________________________ + + E-Mail: ________________________________________________ + + Telephone: _____________________ Fax: _____________________ + + + You accept and agree to the following terms and conditions for Your + present and future Contributions submitted to the Project. In + return, the Copyright Holders shall not use Your Contributions in a way + that is contrary to the public benefit or inconsistent with its nonprofit + status and bylaws in effect at the time of the Contribution. Except + for the license granted herein to the Copyright Holders and recipients of + software distributed by the Copyright Holders, You reserve all right, title, + and interest in and to Your Contributions. + + 1. Definitions. + + "You" (or "Your") shall mean the copyright owner or legal entity + authorized by the copyright owner that is making this Agreement + with the Copyright Holders. For legal entities, the entity making a + Contribution and all other entities that control, are controlled by, + or are under common control with that entity are considered to be a + single Contributor. For the purposes of this definition, "control" + means (i) the power, direct or indirect, to cause the direction or + management of such entity, whether by contract or otherwise, or + (ii) ownership of fifty percent (50%) or more of the outstanding + shares, or (iii) beneficial ownership of such entity. 
+ + "Contribution" shall mean the code, documentation or other original + works of authorship expressly identified in Schedule B, as well as + any original work of authorship, including + any modifications or additions to an existing work, that is intentionally + submitted by You to the Copyright Holders for inclusion in, or + documentation of, any of the products owned or managed by the + Copyright Holders (the "Work"). For the purposes of this definition, + "submitted" means any form of electronic, verbal, or written + communication sent to the Copyright Holders or its representatives, + including but not limited to communication on electronic mailing + lists, source code control systems, and issue tracking systems + that are managed by, or on behalf of, the Copyright Holders for the + purpose of discussing and improving the Work, but excluding + communication that is conspicuously marked or otherwise designated + in writing by You as "Not a Contribution." + + 2. Grant of Copyright License. Subject to the terms and conditions + of this Agreement, You hereby grant to the Copyright Holders and to + recipients of software distributed by the Copyright Holders a + perpetual, worldwide, non-exclusive, no-charge, royalty-free, + irrevocable copyright license to reproduce, prepare derivative works + of, publicly display, publicly perform, sublicense, and distribute + Your Contributions and such derivative works. + + 3. Grant of Patent License. 
Subject to the terms and conditions of + this Agreement, You hereby grant to the Copyright Holders and to + recipients of software distributed by the Copyright Holders + a perpetual, worldwide, non-exclusive, no-charge, royalty-free, + irrevocable (except as stated in this section) patent license + to make, have made, use, offer to sell, sell, import, and otherwise + transfer the Work, where such license applies only to those + patent claims licensable by You that are necessarily infringed + by Your Contribution(s) alone or by combination of Your Contribution(s) + with the Work to which such Contribution(s) were submitted. + If any entity institutes patent litigation against You or any + other entity (including a cross-claim or counterclaim in a lawsuit) + alleging that your Contribution, or the Work to which you have + contributed, constitutes direct or contributory patent infringement, + then any patent licenses granted to that entity under this Agreement + for that Contribution or Work shall terminate as of the date such + litigation is filed. + + 4. You represent that You are legally entitled to grant the above + license. You represent further that each employee of the + Corporation designated on Schedule A below (or in a subsequent + written modification to that Schedule) is authorized to submit + Contributions on behalf of the Corporation. + + 5. You represent that each of Your Contributions is Your original + creation (see section 7 for submissions on behalf of others). + + 6. You are not expected to provide support for Your Contributions, + except to the extent You desire to provide support. You may provide + support for free, for a fee, or not at all. 
Unless required by + applicable law or agreed to in writing, You provide Your + Contributions on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS + OF ANY KIND, either express or implied, including, without + limitation, any warranties or conditions of TITLE, NON-INFRINGEMENT, + MERCHANTABILITY, or FITNESS FOR A PARTICULAR PURPOSE. + + 7. Should You wish to submit work that is not Your original creation, + You may submit it to the Copyright Holders separately from any + Contribution, identifying the complete details of its source and + of any license or other restriction (including, but not limited + to, related patents, trademarks, and license agreements) of which + you are personally aware, and conspicuously marking the work as + "Submitted on behalf of a third-party: [named here]". + + 8. It is your responsibility to notify the Copyright Holders when any change + is required to the list of designated employees authorized to submit + Contributions on behalf of the Corporation, or to the Corporation's + Point of Contact with the Copyright Holders. + + + + Please sign: __________________________________ Date: _______________ + + Title: __________________________________ + + Corporation: __________________________________ + + + + +Schedule A + + [Initial list of designated employees. NB: authorization is not + tied to particular Contributions.] + + + + + + +Schedule B + + [Identification of optional concurrent software grant. Would be + left blank or omitted if there is no concurrent software grant.] + + diff --git a/vendor/github.com/NVIDIA/gpu-monitoring-tools/LICENSE b/vendor/github.com/NVIDIA/gpu-monitoring-tools/LICENSE new file mode 100644 index 000000000..2a718d63d --- /dev/null +++ b/vendor/github.com/NVIDIA/gpu-monitoring-tools/LICENSE @@ -0,0 +1,29 @@ +BSD 3-Clause License + +Copyright (c) 2018, NVIDIA Corporation +All rights reserved. 
+ +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are met: + +* Redistributions of source code must retain the above copyright notice, this + list of conditions and the following disclaimer. + +* Redistributions in binary form must reproduce the above copyright notice, + this list of conditions and the following disclaimer in the documentation + and/or other materials provided with the distribution. + +* Neither the name of the copyright holder nor the names of its + contributors may be used to endorse or promote products derived from + this software without specific prior written permission. + +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE +FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. diff --git a/vendor/github.com/NVIDIA/gpu-monitoring-tools/README.md b/vendor/github.com/NVIDIA/gpu-monitoring-tools/README.md new file mode 100644 index 000000000..58d90402e --- /dev/null +++ b/vendor/github.com/NVIDIA/gpu-monitoring-tools/README.md @@ -0,0 +1,34 @@ +# NVIDIA GPU Monitoring Tools + +## NVML Go Bindings + +[NVIDIA Management Library (NVML)](https://developer.nvidia.com/nvidia-management-library-nvml) is a C-based API for monitoring and managing NVIDIA GPU devices. 
+NVML go bindings are taken from [nvidia-docker 1.0](https://github.com/NVIDIA/nvidia-docker/tree/1.0) with some improvements and additions. NVML headers are also added to the package to make it easy to use and build. + +### NVML Samples +Three [samples](https://github.com/NVIDIA/gpu-monitoring-tools/blob/master/bindings/go/samples/nvml/README.md) are included to demonstrate how to use the NVML API. + + +## DCGM Go Bindings + +[NVIDIA Data Center GPU Manager (DCGM)](https://developer.nvidia.com/data-center-gpu-manager-dcgm) is a set of tools for managing and monitoring NVIDIA GPUs in cluster environments. It's a low overhead tool suite that performs a variety of functions on each host system including active health monitoring, diagnostics, system validation, policies, power and clock management, group configuration and accounting. + +DCGM go bindings make administering and monitoring containerized GPU applications easy. + +### DCGM Samples + +DCGM can be run in different modes; seven [samples](https://github.com/NVIDIA/gpu-monitoring-tools/blob/master/bindings/go/samples/dcgm/README.md) and a [REST API](https://github.com/NVIDIA/gpu-monitoring-tools/blob/master/bindings/go/samples/dcgm/restApi/README.md) are included for showing how to use the DCGM API and run it in different modes. + + +## DCGM exporter + +GPU metrics exporter for [Prometheus](https://prometheus.io/) leveraging [NVIDIA Data Center GPU Manager (DCGM)](https://developer.nvidia.com/data-center-gpu-manager-dcgm) is a simple shell script that starts nv-hostengine, reads GPU metrics every 1 second and converts them to a standard Prometheus format. + +Find the installation and run instructions [here](https://github.com/NVIDIA/gpu-monitoring-tools/blob/master/exporters/prometheus-dcgm/README.md). 
+ +## Issues and Contributing + +A signed copy of the [Contributor License Agreement](https://github.com/NVIDIA/gpu-monitoring-tools/blob/master/CLA) needs to be provided to digits@nvidia.com before any change can be accepted. + +* Please let us know by [filing a new issue](https://github.com/NVIDIA/gpu-monitoring-tools/issues/new) +* You can contribute by opening a [pull request](https://help.github.com/articles/using-pull-requests/) diff --git a/vendor/github.com/NVIDIA/gpu-monitoring-tools/bindings/go/nvml/bindings.go b/vendor/github.com/NVIDIA/gpu-monitoring-tools/bindings/go/nvml/bindings.go new file mode 100644 index 000000000..4bba89834 --- /dev/null +++ b/vendor/github.com/NVIDIA/gpu-monitoring-tools/bindings/go/nvml/bindings.go @@ -0,0 +1,634 @@ +// Copyright (c) 2015-2018, NVIDIA CORPORATION. All rights reserved. + +package nvml + +// #cgo LDFLAGS: -ldl -Wl,--unresolved-symbols=ignore-in-object-files +// #include "nvml_dl.h" +import "C" + +import ( + "errors" + "fmt" + "io/ioutil" + "os" + "sort" + "strconv" + "strings" +) + +const ( + szDriver = C.NVML_SYSTEM_DRIVER_VERSION_BUFFER_SIZE + szName = C.NVML_DEVICE_NAME_BUFFER_SIZE + szUUID = C.NVML_DEVICE_UUID_BUFFER_SIZE + szProcs = 32 + szProcName = 64 + + XidCriticalError = C.nvmlEventTypeXidCriticalError +) + +type handle struct{ dev C.nvmlDevice_t } +type EventSet struct{ set C.nvmlEventSet_t } +type Event struct { + UUID *string + Etype uint64 + Edata uint64 +} + +func uintPtr(c C.uint) *uint { + i := uint(c) + return &i +} + +func uint64Ptr(c C.ulonglong) *uint64 { + i := uint64(c) + return &i +} + +func stringPtr(c *C.char) *string { + s := C.GoString(c) + return &s +} + +func errorString(ret C.nvmlReturn_t) error { + if ret == C.NVML_SUCCESS { + return nil + } + err := C.GoString(C.nvmlErrorString(ret)) + return fmt.Errorf("nvml: %v", err) +} + +func init_() error { + r := C.nvmlInit_dl() + if r == C.NVML_ERROR_LIBRARY_NOT_FOUND { + return errors.New("could not load NVML library") + } + return 
errorString(r) +} + +func NewEventSet() EventSet { + var set C.nvmlEventSet_t + C.nvmlEventSetCreate(&set) + + return EventSet{set} +} + +func RegisterEvent(es EventSet, event int) error { + n, err := deviceGetCount() + if err != nil { + return err + } + + var i uint + for i = 0; i < n; i++ { + h, err := deviceGetHandleByIndex(i) + if err != nil { + return err + } + + r := C.nvmlDeviceRegisterEvents(h.dev, C.ulonglong(event), es.set) + if r != C.NVML_SUCCESS { + return errorString(r) + } + } + + return nil +} + +func RegisterEventForDevice(es EventSet, event int, uuid string) error { + n, err := deviceGetCount() + if err != nil { + return err + } + + var i uint + for i = 0; i < n; i++ { + h, err := deviceGetHandleByIndex(i) + if err != nil { + return err + } + + duuid, err := h.deviceGetUUID() + if err != nil { + return err + } + + if *duuid != uuid { + continue + } + + r := C.nvmlDeviceRegisterEvents(h.dev, C.ulonglong(event), es.set) + if r != C.NVML_SUCCESS { + return errorString(r) + } + + return nil + } + + return fmt.Errorf("nvml: device not found") +} + +func DeleteEventSet(es EventSet) { + C.nvmlEventSetFree(es.set) +} + +func WaitForEvent(es EventSet, timeout uint) (Event, error) { + var data C.nvmlEventData_t + + r := C.nvmlEventSetWait(es.set, &data, C.uint(timeout)) + uuid, _ := handle{data.device}.deviceGetUUID() + + return Event{ + UUID: uuid, + Etype: uint64(data.eventType), + Edata: uint64(data.eventData), + }, + errorString(r) +} + +func shutdown() error { + return errorString(C.nvmlShutdown_dl()) +} + +func systemGetDriverVersion() (string, error) { + var driver [szDriver]C.char + + r := C.nvmlSystemGetDriverVersion(&driver[0], szDriver) + return C.GoString(&driver[0]), errorString(r) +} + +func systemGetProcessName(pid uint) (string, error) { + var proc [szProcName]C.char + + r := C.nvmlSystemGetProcessName(C.uint(pid), &proc[0], szProcName) + return C.GoString(&proc[0]), errorString(r) +} + +func deviceGetCount() (uint, error) { + var n C.uint + 
+ r := C.nvmlDeviceGetCount(&n) + return uint(n), errorString(r) +} + +func deviceGetHandleByIndex(idx uint) (handle, error) { + var dev C.nvmlDevice_t + + r := C.nvmlDeviceGetHandleByIndex(C.uint(idx), &dev) + return handle{dev}, errorString(r) +} + +func deviceGetTopologyCommonAncestor(h1, h2 handle) (*uint, error) { + var level C.nvmlGpuTopologyLevel_t + + r := C.nvmlDeviceGetTopologyCommonAncestor_dl(h1.dev, h2.dev, &level) + if r == C.NVML_ERROR_FUNCTION_NOT_FOUND || r == C.NVML_ERROR_NOT_SUPPORTED { + return nil, nil + } + return uintPtr(C.uint(level)), errorString(r) +} + +func (h handle) deviceGetName() (*string, error) { + var name [szName]C.char + + r := C.nvmlDeviceGetName(h.dev, &name[0], szName) + if r == C.NVML_ERROR_NOT_SUPPORTED { + return nil, nil + } + return stringPtr(&name[0]), errorString(r) +} + +func (h handle) deviceGetUUID() (*string, error) { + var uuid [szUUID]C.char + + r := C.nvmlDeviceGetUUID(h.dev, &uuid[0], szUUID) + if r == C.NVML_ERROR_NOT_SUPPORTED { + return nil, nil + } + return stringPtr(&uuid[0]), errorString(r) +} + +func (h handle) deviceGetPciInfo() (*string, error) { + var pci C.nvmlPciInfo_t + + r := C.nvmlDeviceGetPciInfo(h.dev, &pci) + if r == C.NVML_ERROR_NOT_SUPPORTED { + return nil, nil + } + return stringPtr(&pci.busId[0]), errorString(r) +} + +func (h handle) deviceGetMinorNumber() (*uint, error) { + var minor C.uint + + r := C.nvmlDeviceGetMinorNumber(h.dev, &minor) + if r == C.NVML_ERROR_NOT_SUPPORTED { + return nil, nil + } + return uintPtr(minor), errorString(r) +} + +func (h handle) deviceGetBAR1MemoryInfo() (*uint64, *uint64, error) { + var bar1 C.nvmlBAR1Memory_t + + r := C.nvmlDeviceGetBAR1MemoryInfo(h.dev, &bar1) + if r == C.NVML_ERROR_NOT_SUPPORTED { + return nil, nil, nil + } + return uint64Ptr(bar1.bar1Total), uint64Ptr(bar1.bar1Used), errorString(r) +} + +func (h handle) deviceGetPowerManagementLimit() (*uint, error) { + var power C.uint + + r := C.nvmlDeviceGetPowerManagementLimit(h.dev, &power) + if 
r == C.NVML_ERROR_NOT_SUPPORTED { + return nil, nil + } + return uintPtr(power), errorString(r) +} + +func (h handle) deviceGetMaxClockInfo() (*uint, *uint, error) { + var sm, mem C.uint + + r := C.nvmlDeviceGetMaxClockInfo(h.dev, C.NVML_CLOCK_SM, &sm) + if r == C.NVML_ERROR_NOT_SUPPORTED { + return nil, nil, nil + } + if r == C.NVML_SUCCESS { + r = C.nvmlDeviceGetMaxClockInfo(h.dev, C.NVML_CLOCK_MEM, &mem) + } + return uintPtr(sm), uintPtr(mem), errorString(r) +} + +func (h handle) deviceGetMaxPcieLinkGeneration() (*uint, error) { + var link C.uint + + r := C.nvmlDeviceGetMaxPcieLinkGeneration(h.dev, &link) + if r == C.NVML_ERROR_NOT_SUPPORTED { + return nil, nil + } + return uintPtr(link), errorString(r) +} + +func (h handle) deviceGetMaxPcieLinkWidth() (*uint, error) { + var width C.uint + + r := C.nvmlDeviceGetMaxPcieLinkWidth(h.dev, &width) + if r == C.NVML_ERROR_NOT_SUPPORTED { + return nil, nil + } + return uintPtr(width), errorString(r) +} + +func (h handle) deviceGetPowerUsage() (*uint, error) { + var power C.uint + + r := C.nvmlDeviceGetPowerUsage(h.dev, &power) + if r == C.NVML_ERROR_NOT_SUPPORTED { + return nil, nil + } + return uintPtr(power), errorString(r) +} + +func (h handle) deviceGetTemperature() (*uint, error) { + var temp C.uint + + r := C.nvmlDeviceGetTemperature(h.dev, C.NVML_TEMPERATURE_GPU, &temp) + if r == C.NVML_ERROR_NOT_SUPPORTED { + return nil, nil + } + return uintPtr(temp), errorString(r) +} + +func (h handle) deviceGetUtilizationRates() (*uint, *uint, error) { + var usage C.nvmlUtilization_t + + r := C.nvmlDeviceGetUtilizationRates(h.dev, &usage) + if r == C.NVML_ERROR_NOT_SUPPORTED { + return nil, nil, nil + } + return uintPtr(usage.gpu), uintPtr(usage.memory), errorString(r) +} + +func (h handle) deviceGetEncoderUtilization() (*uint, error) { + var usage, sampling C.uint + + r := C.nvmlDeviceGetEncoderUtilization(h.dev, &usage, &sampling) + if r == C.NVML_ERROR_NOT_SUPPORTED { + return nil, nil + } + return uintPtr(usage), 
errorString(r) +} + +func (h handle) deviceGetDecoderUtilization() (*uint, error) { + var usage, sampling C.uint + + r := C.nvmlDeviceGetDecoderUtilization(h.dev, &usage, &sampling) + if r == C.NVML_ERROR_NOT_SUPPORTED { + return nil, nil + } + return uintPtr(usage), errorString(r) +} + +func (h handle) deviceGetMemoryInfo() (totalMem *uint64, devMem DeviceMemory, err error) { + var mem C.nvmlMemory_t + + r := C.nvmlDeviceGetMemoryInfo(h.dev, &mem) + if r == C.NVML_ERROR_NOT_SUPPORTED { + return + } + + err = errorString(r) + if r != C.NVML_SUCCESS { + return + } + + totalMem = uint64Ptr(mem.total) + if totalMem != nil { + *totalMem /= 1024 * 1024 // MiB + } + + devMem = DeviceMemory{ + Used: uint64Ptr(mem.used), + Free: uint64Ptr(mem.free), + } + + if devMem.Used != nil { + *devMem.Used /= 1024 * 1024 // MiB + } + + if devMem.Free != nil { + *devMem.Free /= 1024 * 1024 // MiB + } + return +} + +func (h handle) deviceGetClockInfo() (*uint, *uint, error) { + var sm, mem C.uint + + r := C.nvmlDeviceGetClockInfo(h.dev, C.NVML_CLOCK_SM, &sm) + if r == C.NVML_ERROR_NOT_SUPPORTED { + return nil, nil, nil + } + if r == C.NVML_SUCCESS { + r = C.nvmlDeviceGetClockInfo(h.dev, C.NVML_CLOCK_MEM, &mem) + } + return uintPtr(sm), uintPtr(mem), errorString(r) +} + +func (h handle) deviceGetMemoryErrorCounter() (*uint64, *uint64, *uint64, error) { + var l1, l2, mem C.ulonglong + + r := C.nvmlDeviceGetMemoryErrorCounter(h.dev, C.NVML_MEMORY_ERROR_TYPE_UNCORRECTED, + C.NVML_VOLATILE_ECC, C.NVML_MEMORY_LOCATION_L1_CACHE, &l1) + if r == C.NVML_ERROR_NOT_SUPPORTED { + return nil, nil, nil, nil + } + if r == C.NVML_SUCCESS { + r = C.nvmlDeviceGetMemoryErrorCounter(h.dev, C.NVML_MEMORY_ERROR_TYPE_UNCORRECTED, + C.NVML_VOLATILE_ECC, C.NVML_MEMORY_LOCATION_L2_CACHE, &l2) + } + if r == C.NVML_SUCCESS { + r = C.nvmlDeviceGetMemoryErrorCounter(h.dev, C.NVML_MEMORY_ERROR_TYPE_UNCORRECTED, + C.NVML_VOLATILE_ECC, C.NVML_MEMORY_LOCATION_DEVICE_MEMORY, &mem) + } + return uint64Ptr(l1), 
uint64Ptr(l2), uint64Ptr(mem), errorString(r) +} + +func (h handle) deviceGetPcieThroughput() (*uint, *uint, error) { + var rx, tx C.uint + + r := C.nvmlDeviceGetPcieThroughput(h.dev, C.NVML_PCIE_UTIL_RX_BYTES, &rx) + if r == C.NVML_ERROR_NOT_SUPPORTED { + return nil, nil, nil + } + if r == C.NVML_SUCCESS { + r = C.nvmlDeviceGetPcieThroughput(h.dev, C.NVML_PCIE_UTIL_TX_BYTES, &tx) + } + return uintPtr(rx), uintPtr(tx), errorString(r) +} + +func (h handle) deviceGetComputeRunningProcesses() ([]uint, []uint64, error) { + var procs [szProcs]C.nvmlProcessInfo_t + var count = C.uint(szProcs) + + r := C.nvmlDeviceGetComputeRunningProcesses(h.dev, &count, &procs[0]) + if r == C.NVML_ERROR_NOT_SUPPORTED { + return nil, nil, nil + } + n := int(count) + pids := make([]uint, n) + mems := make([]uint64, n) + for i := 0; i < n; i++ { + pids[i] = uint(procs[i].pid) + mems[i] = uint64(procs[i].usedGpuMemory) + } + return pids, mems, errorString(r) +} + +func (h handle) deviceGetGraphicsRunningProcesses() ([]uint, []uint64, error) { + var procs [szProcs]C.nvmlProcessInfo_t + var count = C.uint(szProcs) + + r := C.nvmlDeviceGetGraphicsRunningProcesses(h.dev, &count, &procs[0]) + if r == C.NVML_ERROR_NOT_SUPPORTED { + return nil, nil, nil + } + n := int(count) + pids := make([]uint, n) + mems := make([]uint64, n) + for i := 0; i < n; i++ { + pids[i] = uint(procs[i].pid) + mems[i] = uint64(procs[i].usedGpuMemory) + } + return pids, mems, errorString(r) +} + +func (h handle) deviceGetAllRunningProcesses() ([]ProcessInfo, error) { + cPids, cpMems, err := h.deviceGetComputeRunningProcesses() + if err != nil { + return nil, err + } + + gPids, gpMems, err := h.deviceGetGraphicsRunningProcesses() + if err != nil { + return nil, err + } + + allPids := make(map[uint]ProcessInfo) + + for i, pid := range cPids { + name, err := processName(pid) + if err != nil { + return nil, err + } + allPids[pid] = ProcessInfo{ + PID: pid, + Name: name, + MemoryUsed: cpMems[i] / (1024 * 1024), // MiB + Type: 
Compute, + } + + } + + for i, pid := range gPids { + pInfo, exists := allPids[pid] + if exists { + pInfo.Type = ComputeAndGraphics + allPids[pid] = pInfo + } else { + name, err := processName(pid) + if err != nil { + return nil, err + } + allPids[pid] = ProcessInfo{ + PID: pid, + Name: name, + MemoryUsed: gpMems[i] / (1024 * 1024), // MiB + Type: Graphics, + } + } + } + + var processInfo []ProcessInfo + for _, v := range allPids { + processInfo = append(processInfo, v) + } + sort.Slice(processInfo, func(i, j int) bool { + return processInfo[i].PID < processInfo[j].PID + }) + + return processInfo, nil +} + +func (h handle) getClocksThrottleReasons() (reason ThrottleReason, err error) { + var clocksThrottleReasons C.ulonglong + + r := C.nvmlDeviceGetCurrentClocksThrottleReasons(h.dev, &clocksThrottleReasons) + + if r == C.NVML_ERROR_NOT_SUPPORTED { + return ThrottleReasonUnknown, nil + } + + if r != C.NVML_SUCCESS { + return ThrottleReasonUnknown, errorString(r) + } + + switch clocksThrottleReasons { + case C.nvmlClocksThrottleReasonGpuIdle: + reason = ThrottleReasonGpuIdle + case C.nvmlClocksThrottleReasonApplicationsClocksSetting: + reason = ThrottleReasonApplicationsClocksSetting + case C.nvmlClocksThrottleReasonSwPowerCap: + reason = ThrottleReasonSwPowerCap + case C.nvmlClocksThrottleReasonHwSlowdown: + reason = ThrottleReasonHwSlowdown + case C.nvmlClocksThrottleReasonSyncBoost: + reason = ThrottleReasonSyncBoost + case C.nvmlClocksThrottleReasonSwThermalSlowdown: + reason = ThrottleReasonSwThermalSlowdown + case C.nvmlClocksThrottleReasonHwThermalSlowdown: + reason = ThrottleReasonHwThermalSlowdown + case C.nvmlClocksThrottleReasonHwPowerBrakeSlowdown: + reason = ThrottleReasonHwPowerBrakeSlowdown + case C.nvmlClocksThrottleReasonDisplayClockSetting: + reason = ThrottleReasonDisplayClockSetting + case C.nvmlClocksThrottleReasonNone: + reason = ThrottleReasonNone + } + return +} + +func (h handle) getPerformanceState() (PerfState, error) { + var pstate 
C.nvmlPstates_t + + r := C.nvmlDeviceGetPerformanceState(h.dev, &pstate) + + if r == C.NVML_ERROR_NOT_SUPPORTED { + return PerfStateUnknown, nil + } + + if r != C.NVML_SUCCESS { + return PerfStateUnknown, errorString(r) + } + return PerfState(pstate), nil +} + +func processName(pid uint) (string, error) { + f := `/proc/` + strconv.FormatUint(uint64(pid), 10) + `/comm` + d, err := ioutil.ReadFile(f) + + if err != nil { + // TOCTOU: process terminated + if os.IsNotExist(err) { + return "", nil + } + return "", err + } + return strings.TrimSuffix(string(d), "\n"), err +} + +func (h handle) getAccountingInfo() (accountingInfo Accounting, err error) { + var mode C.nvmlEnableState_t + var buffer C.uint + + r := C.nvmlDeviceGetAccountingMode(h.dev, &mode) + if r == C.NVML_ERROR_NOT_SUPPORTED { + return + } + + if r != C.NVML_SUCCESS { + return accountingInfo, errorString(r) + } + + r = C.nvmlDeviceGetAccountingBufferSize(h.dev, &buffer) + if r == C.NVML_ERROR_NOT_SUPPORTED { + return + } + + if r != C.NVML_SUCCESS { + return accountingInfo, errorString(r) + } + + accountingInfo = Accounting{ + Mode: ModeState(mode), + BufferSize: uintPtr(buffer), + } + return +} + +func (h handle) getDisplayInfo() (display Display, err error) { + var mode, isActive C.nvmlEnableState_t + + r := C.nvmlDeviceGetDisplayActive(h.dev, &mode) + if r == C.NVML_ERROR_NOT_SUPPORTED { + return + } + + if r != C.NVML_SUCCESS { + return display, errorString(r) + } + + r = C.nvmlDeviceGetDisplayMode(h.dev, &isActive) + if r == C.NVML_ERROR_NOT_SUPPORTED { + return + } + if r != C.NVML_SUCCESS { + return display, errorString(r) + } + display = Display{ + Mode: ModeState(mode), + Active: ModeState(isActive), + } + return +} + +func (h handle) getPeristenceMode() (state ModeState, err error) { + var mode C.nvmlEnableState_t + + r := C.nvmlDeviceGetPersistenceMode(h.dev, &mode) + if r == C.NVML_ERROR_NOT_SUPPORTED { + return + } + return ModeState(mode), errorString(r) +} diff --git 
a/vendor/github.com/NVIDIA/gpu-monitoring-tools/bindings/go/nvml/nvml.go b/vendor/github.com/NVIDIA/gpu-monitoring-tools/bindings/go/nvml/nvml.go new file mode 100644 index 000000000..f6ec9e8fa --- /dev/null +++ b/vendor/github.com/NVIDIA/gpu-monitoring-tools/bindings/go/nvml/nvml.go @@ -0,0 +1,533 @@ +// Copyright (c) 2015-2018, NVIDIA CORPORATION. All rights reserved. + +package nvml + +// #include "nvml_dl.h" +import "C" + +import ( + "bytes" + "errors" + "fmt" + "io/ioutil" + "strconv" + "strings" +) + +var ( + ErrCPUAffinity = errors.New("failed to retrieve CPU affinity") + ErrUnsupportedP2PLink = errors.New("unsupported P2P link type") + ErrUnsupportedGPU = errors.New("unsupported GPU device") +) + +type ModeState uint + +const ( + Enabled ModeState = iota + Disabled +) + +func (m ModeState) String() string { + switch m { + case Enabled: + return "Enabled" + case Disabled: + return "Disabled" + } + return "N/A" +} + +type Display struct { + Mode ModeState + Active ModeState +} + +type Accounting struct { + Mode ModeState + BufferSize *uint +} + +type DeviceMode struct { + DisplayInfo Display + Persistence ModeState + AccountingInfo Accounting +} + +type ThrottleReason uint + +const ( + ThrottleReasonGpuIdle ThrottleReason = iota + ThrottleReasonApplicationsClocksSetting + ThrottleReasonSwPowerCap + ThrottleReasonHwSlowdown + ThrottleReasonSyncBoost + ThrottleReasonSwThermalSlowdown + ThrottleReasonHwThermalSlowdown + ThrottleReasonHwPowerBrakeSlowdown + ThrottleReasonDisplayClockSetting + ThrottleReasonNone + ThrottleReasonUnknown +) + +func (r ThrottleReason) String() string { + switch r { + case ThrottleReasonGpuIdle: + return "Gpu Idle" + case ThrottleReasonApplicationsClocksSetting: + return "Applications Clocks Setting" + case ThrottleReasonSwPowerCap: + return "SW Power Cap" + case ThrottleReasonHwSlowdown: + return "HW Slowdown" + case ThrottleReasonSyncBoost: + return "Sync Boost" + case ThrottleReasonSwThermalSlowdown: + return "SW Thermal Slowdown" 
+ case ThrottleReasonHwThermalSlowdown: + return "HW Thermal Slowdown" + case ThrottleReasonHwPowerBrakeSlowdown: + return "HW Power Brake Slowdown" + case ThrottleReasonDisplayClockSetting: + return "Display Clock Setting" + case ThrottleReasonNone: + return "No clocks throttling" + } + return "N/A" +} + +type PerfState uint + +const ( + PerfStateMax = 0 + PerfStateMin = 15 + PerfStateUnknown = 32 +) + +func (p PerfState) String() string { + if p >= PerfStateMax && p <= PerfStateMin { + return fmt.Sprintf("P%d", p) + } + return "Unknown" +} + +type ProcessType uint + +const ( + Compute ProcessType = iota + Graphics + ComputeAndGraphics +) + +func (t ProcessType) String() string { + typ := "C+G" + if t == Compute { + typ = "C" + } else if t == Graphics { + typ = "G" + } + return typ +} + +type P2PLinkType uint + +const ( + P2PLinkUnknown P2PLinkType = iota + P2PLinkCrossCPU + P2PLinkSameCPU + P2PLinkHostBridge + P2PLinkMultiSwitch + P2PLinkSingleSwitch + P2PLinkSameBoard +) + +type P2PLink struct { + BusID string + Link P2PLinkType +} + +func (t P2PLinkType) String() string { + switch t { + case P2PLinkCrossCPU: + return "Cross CPU socket" + case P2PLinkSameCPU: + return "Same CPU socket" + case P2PLinkHostBridge: + return "Host PCI bridge" + case P2PLinkMultiSwitch: + return "Multiple PCI switches" + case P2PLinkSingleSwitch: + return "Single PCI switch" + case P2PLinkSameBoard: + return "Same board" + case P2PLinkUnknown: + } + return "N/A" +} + +type ClockInfo struct { + Cores *uint + Memory *uint +} + +type PCIInfo struct { + BusID string + BAR1 *uint64 + Bandwidth *uint +} + +type Device struct { + handle + + UUID string + Path string + Model *string + Power *uint + Memory *uint64 + CPUAffinity *uint + PCI PCIInfo + Clocks ClockInfo + Topology []P2PLink +} + +type UtilizationInfo struct { + GPU *uint + Memory *uint + Encoder *uint + Decoder *uint +} + +type PCIThroughputInfo struct { + RX *uint + TX *uint +} + +type PCIStatusInfo struct { + BAR1Used *uint64 + 
Throughput PCIThroughputInfo +} + +type ECCErrorsInfo struct { + L1Cache *uint64 + L2Cache *uint64 + Device *uint64 +} + +type DeviceMemory struct { + Used *uint64 + Free *uint64 +} + +type MemoryInfo struct { + Global DeviceMemory + ECCErrors ECCErrorsInfo +} + +type ProcessInfo struct { + PID uint + Name string + MemoryUsed uint64 + Type ProcessType +} + +type DeviceStatus struct { + Power *uint + Temperature *uint + Utilization UtilizationInfo + Memory MemoryInfo + Clocks ClockInfo + PCI PCIStatusInfo + Processes []ProcessInfo + Throttle ThrottleReason + Performance PerfState +} + +func assert(err error) { + if err != nil { + panic(err) + } +} + +func Init() error { + return init_() +} + +func Shutdown() error { + return shutdown() +} + +func GetDeviceCount() (uint, error) { + return deviceGetCount() +} + +func GetDriverVersion() (string, error) { + return systemGetDriverVersion() +} + +func numaNode(busid string) (uint, error) { + // discard leading zeros of busid + b, err := ioutil.ReadFile(fmt.Sprintf("/sys/bus/pci/devices/%s/numa_node", strings.ToLower(busid[4:]))) + if err != nil { + // XXX report node 0 if NUMA support isn't enabled + return 0, nil + } + node, err := strconv.ParseInt(string(bytes.TrimSpace(b)), 10, 8) + if err != nil { + return 0, fmt.Errorf("%v: %v", ErrCPUAffinity, err) + } + if node < 0 { + node = 0 // XXX report node 0 instead of NUMA_NO_NODE + } + return uint(node), nil +} + +func pciBandwidth(gen, width *uint) *uint { + m := map[uint]uint{ + 1: 250, // MB/s + 2: 500, + 3: 985, + 4: 1969, + } + if gen == nil || width == nil { + return nil + } + bw := m[*gen] * *width + return &bw +} + +func NewDevice(idx uint) (device *Device, err error) { + defer func() { + if r := recover(); r != nil { + err = r.(error) + } + }() + + h, err := deviceGetHandleByIndex(idx) + assert(err) + model, err := h.deviceGetName() + assert(err) + uuid, err := h.deviceGetUUID() + assert(err) + minor, err := h.deviceGetMinorNumber() + assert(err) + power, err := 
h.deviceGetPowerManagementLimit() + assert(err) + totalMem, _, err := h.deviceGetMemoryInfo() + assert(err) + busid, err := h.deviceGetPciInfo() + assert(err) + bar1, _, err := h.deviceGetBAR1MemoryInfo() + assert(err) + pcig, err := h.deviceGetMaxPcieLinkGeneration() + assert(err) + pciw, err := h.deviceGetMaxPcieLinkWidth() + assert(err) + ccore, cmem, err := h.deviceGetMaxClockInfo() + assert(err) + + if minor == nil || busid == nil || uuid == nil { + return nil, ErrUnsupportedGPU + } + path := fmt.Sprintf("/dev/nvidia%d", *minor) + node, err := numaNode(*busid) + assert(err) + + device = &Device{ + handle: h, + UUID: *uuid, + Path: path, + Model: model, + Power: power, + Memory: totalMem, + CPUAffinity: &node, + PCI: PCIInfo{ + BusID: *busid, + BAR1: bar1, + Bandwidth: pciBandwidth(pcig, pciw), // MB/s + }, + Clocks: ClockInfo{ + Cores: ccore, // MHz + Memory: cmem, // MHz + }, + } + if power != nil { + *device.Power /= 1000 // W + } + if bar1 != nil { + *device.PCI.BAR1 /= 1024 * 1024 // MiB + } + return +} + +func NewDeviceLite(idx uint) (device *Device, err error) { + defer func() { + if r := recover(); r != nil { + err = r.(error) + } + }() + + h, err := deviceGetHandleByIndex(idx) + assert(err) + uuid, err := h.deviceGetUUID() + assert(err) + minor, err := h.deviceGetMinorNumber() + assert(err) + busid, err := h.deviceGetPciInfo() + assert(err) + + if minor == nil || busid == nil || uuid == nil { + return nil, ErrUnsupportedGPU + } + path := fmt.Sprintf("/dev/nvidia%d", *minor) + + device = &Device{ + handle: h, + UUID: *uuid, + Path: path, + PCI: PCIInfo{ + BusID: *busid, + }, + } + return +} + +func (d *Device) Status() (status *DeviceStatus, err error) { + defer func() { + if r := recover(); r != nil { + err = r.(error) + } + }() + + power, err := d.deviceGetPowerUsage() + assert(err) + temp, err := d.deviceGetTemperature() + assert(err) + ugpu, umem, err := d.deviceGetUtilizationRates() + assert(err) + uenc, err := d.deviceGetEncoderUtilization() + 
assert(err) + udec, err := d.deviceGetDecoderUtilization() + assert(err) + _, devMem, err := d.deviceGetMemoryInfo() + assert(err) + ccore, cmem, err := d.deviceGetClockInfo() + assert(err) + _, bar1, err := d.deviceGetBAR1MemoryInfo() + assert(err) + el1, el2, emem, err := d.deviceGetMemoryErrorCounter() + assert(err) + pcirx, pcitx, err := d.deviceGetPcieThroughput() + assert(err) + throttle, err := d.getClocksThrottleReasons() + assert(err) + perfState, err := d.getPerformanceState() + assert(err) + processInfo, err := d.deviceGetAllRunningProcesses() + assert(err) + + status = &DeviceStatus{ + Power: power, + Temperature: temp, // °C + Utilization: UtilizationInfo{ + GPU: ugpu, // % + Memory: umem, // % + Encoder: uenc, // % + Decoder: udec, // % + }, + Memory: MemoryInfo{ + Global: devMem, + ECCErrors: ECCErrorsInfo{ + L1Cache: el1, + L2Cache: el2, + Device: emem, + }, + }, + Clocks: ClockInfo{ + Cores: ccore, // MHz + Memory: cmem, // MHz + }, + PCI: PCIStatusInfo{ + BAR1Used: bar1, + Throughput: PCIThroughputInfo{ + RX: pcirx, + TX: pcitx, + }, + }, + Throttle: throttle, + Performance: perfState, + Processes: processInfo, + } + if power != nil { + *status.Power /= 1000 // W + } + if bar1 != nil { + *status.PCI.BAR1Used /= 1024 * 1024 // MiB + } + if pcirx != nil { + *status.PCI.Throughput.RX /= 1000 // MB/s + } + if pcitx != nil { + *status.PCI.Throughput.TX /= 1000 // MB/s + } + return +} + +func GetP2PLink(dev1, dev2 *Device) (link P2PLinkType, err error) { + level, err := deviceGetTopologyCommonAncestor(dev1.handle, dev2.handle) + if err != nil || level == nil { + return P2PLinkUnknown, err + } + + switch *level { + case C.NVML_TOPOLOGY_INTERNAL: + link = P2PLinkSameBoard + case C.NVML_TOPOLOGY_SINGLE: + link = P2PLinkSingleSwitch + case C.NVML_TOPOLOGY_MULTIPLE: + link = P2PLinkMultiSwitch + case C.NVML_TOPOLOGY_HOSTBRIDGE: + link = P2PLinkHostBridge + case C.NVML_TOPOLOGY_CPU: + link = P2PLinkSameCPU + case C.NVML_TOPOLOGY_SYSTEM: + link = 
P2PLinkCrossCPU + default: + err = ErrUnsupportedP2PLink + } + return +} + +func (d *Device) GetComputeRunningProcesses() ([]uint, []uint64, error) { + return d.handle.deviceGetComputeRunningProcesses() +} + +func (d *Device) GetGraphicsRunningProcesses() ([]uint, []uint64, error) { + return d.handle.deviceGetGraphicsRunningProcesses() +} + +func (d *Device) GetAllRunningProcesses() ([]ProcessInfo, error) { + return d.handle.deviceGetAllRunningProcesses() +} + +func (d *Device) GetDeviceMode() (mode *DeviceMode, err error) { + defer func() { + if r := recover(); r != nil { + err = r.(error) + } + }() + + display, err := d.getDisplayInfo() + assert(err) + + p, err := d.getPeristenceMode() + assert(err) + + accounting, err := d.getAccountingInfo() + assert(err) + + mode = &DeviceMode{ + DisplayInfo: display, + Persistence: p, + AccountingInfo: accounting, + } + return +} diff --git a/vendor/github.com/NVIDIA/gpu-monitoring-tools/bindings/go/nvml/nvml.h b/vendor/github.com/NVIDIA/gpu-monitoring-tools/bindings/go/nvml/nvml.h new file mode 100644 index 000000000..60185dac2 --- /dev/null +++ b/vendor/github.com/NVIDIA/gpu-monitoring-tools/bindings/go/nvml/nvml.h @@ -0,0 +1,5871 @@ +/* + * Copyright 1993-2017 NVIDIA Corporation. All rights reserved. + * + * NOTICE TO USER: + * + * This source code is subject to NVIDIA ownership rights under U.S. and + * international Copyright laws. Users and possessors of this source code + * are hereby granted a nonexclusive, royalty-free license to use this code + * in individual and commercial software. + * + * NVIDIA MAKES NO REPRESENTATION ABOUT THE SUITABILITY OF THIS SOURCE + * CODE FOR ANY PURPOSE. IT IS PROVIDED "AS IS" WITHOUT EXPRESS OR + * IMPLIED WARRANTY OF ANY KIND. NVIDIA DISCLAIMS ALL WARRANTIES WITH + * REGARD TO THIS SOURCE CODE, INCLUDING ALL IMPLIED WARRANTIES OF + * MERCHANTABILITY, NONINFRINGEMENT, AND FITNESS FOR A PARTICULAR PURPOSE. 
+ * IN NO EVENT SHALL NVIDIA BE LIABLE FOR ANY SPECIAL, INDIRECT, INCIDENTAL, + * OR CONSEQUENTIAL DAMAGES, OR ANY DAMAGES WHATSOEVER RESULTING FROM LOSS + * OF USE, DATA OR PROFITS, WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE + * OR OTHER TORTIOUS ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE + * OR PERFORMANCE OF THIS SOURCE CODE. + * + * U.S. Government End Users. This source code is a "commercial item" as + * that term is defined at 48 C.F.R. 2.101 (OCT 1995), consisting of + * "commercial computer software" and "commercial computer software + * documentation" as such terms are used in 48 C.F.R. 12.212 (SEPT 1995) + * and is provided to the U.S. Government only as a commercial end item. + * Consistent with 48 C.F.R.12.212 and 48 C.F.R. 227.7202-1 through + * 227.7202-4 (JUNE 1995), all U.S. Government End Users acquire the + * source code with only those rights set forth herein. + * + * Any use of this source code in individual and commercial software must + * include, in the user documentation and internal comments to the code, + * the above Disclaimer and U.S. Government End Users Notice. + */ + +/* +NVML API Reference + +The NVIDIA Management Library (NVML) is a C-based programmatic interface for monitoring and +managing various states within NVIDIA Tesla &tm; GPUs. It is intended to be a platform for building +3rd party applications, and is also the underlying library for the NVIDIA-supported nvidia-smi +tool. NVML is thread-safe so it is safe to make simultaneous NVML calls from multiple threads. 
+ +API Documentation + +Supported platforms: +- Windows: Windows Server 2008 R2 64bit, Windows Server 2012 R2 64bit, Windows 7 64bit, Windows 8 64bit, Windows 10 64bit +- Linux: 32-bit and 64-bit +- Hypervisors: Windows Server 2008R2/2012 Hyper-V 64bit, Citrix XenServer 6.2 SP1+, VMware ESX 5.1/5.5 + +Supported products: +- Full Support + - All Tesla products, starting with the Fermi architecture + - All Quadro products, starting with the Fermi architecture + - All GRID products, starting with the Kepler architecture + - Selected GeForce Titan products +- Limited Support + - All GeForce products, starting with the Fermi architecture + +The NVML library can be found at \%ProgramW6432\%\\"NVIDIA Corporation"\\NVSMI\\ on Windows. It is +not added to the system path by default. To dynamically link to NVML, add this path to the PATH +environment variable. To dynamically load NVML, call LoadLibrary with this path. + +On Linux the NVML library will be found on the standard library path. For 64 bit Linux, both the 32 bit +and 64 bit NVML libraries will be installed. 
+ +Online documentation for this library is available at http://docs.nvidia.com/deploy/nvml-api/index.html +*/ + +#ifndef __nvml_nvml_h__ +#define __nvml_nvml_h__ + +#ifdef __cplusplus +extern "C" { +#endif + +/* + * On Windows, set up methods for DLL export + * define NVML_STATIC_IMPORT when using nvml_loader library + */ +#if defined _WINDOWS + #if !defined NVML_STATIC_IMPORT + #if defined NVML_LIB_EXPORT + #define DECLDIR __declspec(dllexport) + #else + #define DECLDIR __declspec(dllimport) + #endif + #else + #define DECLDIR + #endif +#else + #define DECLDIR +#endif + +/** + * NVML API versioning support + */ +#define NVML_API_VERSION 9 +#define NVML_API_VERSION_STR "9" +#define nvmlInit nvmlInit_v2 +#define nvmlDeviceGetPciInfo nvmlDeviceGetPciInfo_v3 +#define nvmlDeviceGetCount nvmlDeviceGetCount_v2 +#define nvmlDeviceGetHandleByIndex nvmlDeviceGetHandleByIndex_v2 +#define nvmlDeviceGetHandleByPciBusId nvmlDeviceGetHandleByPciBusId_v2 +#define nvmlDeviceGetNvLinkRemotePciInfo nvmlDeviceGetNvLinkRemotePciInfo_v2 +#define nvmlDeviceRemoveGpu nvmlDeviceRemoveGpu_v2 + +/***************************************************************************************************/ +/** @defgroup nvmlDeviceStructs Device Structs + * @{ + */ +/***************************************************************************************************/ + +/** + * Special constant that some fields take when they are not available. + * Used when only part of the struct is not available. + * + * Each structure explicitly states when to check for this value. + */ +#define NVML_VALUE_NOT_AVAILABLE (-1) + +typedef struct nvmlDevice_st* nvmlDevice_t; + +/** + * Buffer size guaranteed to be large enough for pci bus id + */ +#define NVML_DEVICE_PCI_BUS_ID_BUFFER_SIZE 32 + +/** + * Buffer size guaranteed to be large enough for pci bus id for ::busIdLegacy + */ +#define NVML_DEVICE_PCI_BUS_ID_BUFFER_V2_SIZE 16 + +/** + * PCI information about a GPU device. 
+ */ +typedef struct nvmlPciInfo_st +{ + char busIdLegacy[NVML_DEVICE_PCI_BUS_ID_BUFFER_V2_SIZE]; //!< The legacy tuple domain:bus:device.function PCI identifier (& NULL terminator) + unsigned int domain; //!< The PCI domain on which the device's bus resides, 0 to 0xffffffff + unsigned int bus; //!< The bus on which the device resides, 0 to 0xff + unsigned int device; //!< The device's id on the bus, 0 to 31 + unsigned int pciDeviceId; //!< The combined 16-bit device id and 16-bit vendor id + + // Added in NVML 2.285 API + unsigned int pciSubSystemId; //!< The 32-bit Sub System Device ID + + char busId[NVML_DEVICE_PCI_BUS_ID_BUFFER_SIZE]; //!< The tuple domain:bus:device.function PCI identifier (& NULL terminator) +} nvmlPciInfo_t; + +/** + * Detailed ECC error counts for a device. + * + * @deprecated Different GPU families can have different memory error counters + * See \ref nvmlDeviceGetMemoryErrorCounter + */ +typedef struct nvmlEccErrorCounts_st +{ + unsigned long long l1Cache; //!< L1 cache errors + unsigned long long l2Cache; //!< L2 cache errors + unsigned long long deviceMemory; //!< Device memory errors + unsigned long long registerFile; //!< Register file errors +} nvmlEccErrorCounts_t; + +/** + * Utilization information for a device. + * Each sample period may be between 1 second and 1/6 second, depending on the product being queried. + */ +typedef struct nvmlUtilization_st +{ + unsigned int gpu; //!< Percent of time over the past sample period during which one or more kernels was executing on the GPU + unsigned int memory; //!< Percent of time over the past sample period during which global (device) memory was being read or written +} nvmlUtilization_t; + +/** + * Memory allocation information for a device. + */ +typedef struct nvmlMemory_st +{ + unsigned long long total; //!< Total installed FB memory (in bytes) + unsigned long long free; //!< Unallocated FB memory (in bytes) + unsigned long long used; //!< Allocated FB memory (in bytes). 
Note that the driver/GPU always sets aside a small amount of memory for bookkeeping +} nvmlMemory_t; + +/** + * BAR1 Memory allocation Information for a device + */ +typedef struct nvmlBAR1Memory_st +{ + unsigned long long bar1Total; //!< Total BAR1 Memory (in bytes) + unsigned long long bar1Free; //!< Unallocated BAR1 Memory (in bytes) + unsigned long long bar1Used; //!< Allocated Used Memory (in bytes) +}nvmlBAR1Memory_t; + +/** + * Information about running compute processes on the GPU + */ +typedef struct nvmlProcessInfo_st +{ + unsigned int pid; //!< Process ID + unsigned long long usedGpuMemory; //!< Amount of used GPU memory in bytes. + //! Under WDDM, \ref NVML_VALUE_NOT_AVAILABLE is always reported + //! because Windows KMD manages all the memory and not the NVIDIA driver +} nvmlProcessInfo_t; + +/** + * Enum to represent type of bridge chip + */ +typedef enum nvmlBridgeChipType_enum +{ + NVML_BRIDGE_CHIP_PLX = 0, + NVML_BRIDGE_CHIP_BRO4 = 1 +}nvmlBridgeChipType_t; + +/** + * Maximum number of NvLink links supported + */ +#define NVML_NVLINK_MAX_LINKS 6 + +/** + * Enum to represent the NvLink utilization counter packet units + */ +typedef enum nvmlNvLinkUtilizationCountUnits_enum +{ + NVML_NVLINK_COUNTER_UNIT_CYCLES = 0, // count by cycles + NVML_NVLINK_COUNTER_UNIT_PACKETS = 1, // count by packets + NVML_NVLINK_COUNTER_UNIT_BYTES = 2, // count by bytes + + // this must be last + NVML_NVLINK_COUNTER_UNIT_COUNT +} nvmlNvLinkUtilizationCountUnits_t; + +/** + * Enum to represent the NvLink utilization counter packet types to count + * ** this is ONLY applicable with the units as packets or bytes + * ** as specified in \a nvmlNvLinkUtilizationCountUnits_t + * ** all packet filter descriptions are target GPU centric + * ** these can be "OR'd" together + */ +typedef enum nvmlNvLinkUtilizationCountPktTypes_enum +{ + NVML_NVLINK_COUNTER_PKTFILTER_NOP = 0x1, // no operation packets + NVML_NVLINK_COUNTER_PKTFILTER_READ = 0x2, // read packets + 
NVML_NVLINK_COUNTER_PKTFILTER_WRITE = 0x4, // write packets + NVML_NVLINK_COUNTER_PKTFILTER_RATOM = 0x8, // reduction atomic requests + NVML_NVLINK_COUNTER_PKTFILTER_NRATOM = 0x10, // non-reduction atomic requests + NVML_NVLINK_COUNTER_PKTFILTER_FLUSH = 0x20, // flush requests + NVML_NVLINK_COUNTER_PKTFILTER_RESPDATA = 0x40, // responses with data + NVML_NVLINK_COUNTER_PKTFILTER_RESPNODATA = 0x80, // responses without data + NVML_NVLINK_COUNTER_PKTFILTER_ALL = 0xFF // all packets +} nvmlNvLinkUtilizationCountPktTypes_t; + +/** + * Struct to define the NVLINK counter controls + */ +typedef struct nvmlNvLinkUtilizationControl_st +{ + nvmlNvLinkUtilizationCountUnits_t units; + nvmlNvLinkUtilizationCountPktTypes_t pktfilter; +} nvmlNvLinkUtilizationControl_t; + +/** + * Enum to represent NvLink queryable capabilities + */ +typedef enum nvmlNvLinkCapability_enum +{ + NVML_NVLINK_CAP_P2P_SUPPORTED = 0, // P2P over NVLink is supported + NVML_NVLINK_CAP_SYSMEM_ACCESS = 1, // Access to system memory is supported + NVML_NVLINK_CAP_P2P_ATOMICS = 2, // P2P atomics are supported + NVML_NVLINK_CAP_SYSMEM_ATOMICS= 3, // System memory atomics are supported + NVML_NVLINK_CAP_SLI_BRIDGE = 4, // SLI is supported over this link + NVML_NVLINK_CAP_VALID = 5, // Link is supported on this device + // should be last + NVML_NVLINK_CAP_COUNT +} nvmlNvLinkCapability_t; + +/** + * Enum to represent NvLink queryable error counters + */ +typedef enum nvmlNvLinkErrorCounter_enum +{ + NVML_NVLINK_ERROR_DL_REPLAY = 0, // Data link transmit replay error counter + NVML_NVLINK_ERROR_DL_RECOVERY = 1, // Data link transmit recovery error counter + NVML_NVLINK_ERROR_DL_CRC_FLIT = 2, // Data link receive flow control digit CRC error counter + NVML_NVLINK_ERROR_DL_CRC_DATA = 3, // Data link receive data CRC error counter + + // this must be last + NVML_NVLINK_ERROR_COUNT +} nvmlNvLinkErrorCounter_t; + +/** + * Represents level relationships within a system between two GPUs + * The enums are spaced to allow 
for future relationships + */ +typedef enum nvmlGpuLevel_enum +{ + NVML_TOPOLOGY_INTERNAL = 0, // e.g. Tesla K80 + NVML_TOPOLOGY_SINGLE = 10, // all devices that only need traverse a single PCIe switch + NVML_TOPOLOGY_MULTIPLE = 20, // all devices that need not traverse a host bridge + NVML_TOPOLOGY_HOSTBRIDGE = 30, // all devices that are connected to the same host bridge + NVML_TOPOLOGY_NODE = 40, // all devices that are connected to the same NUMA node but possibly multiple host bridges + NVML_TOPOLOGY_SYSTEM = 50, // all devices in the system + + // there is purposefully no COUNT here because of the need for spacing above +} nvmlGpuTopologyLevel_t; + +/* Compatibility for CPU->NODE renaming */ +#define NVML_TOPOLOGY_CPU NVML_TOPOLOGY_NODE + +/* P2P Capability Index Status*/ +typedef enum nvmlGpuP2PStatus_enum +{ + NVML_P2P_STATUS_OK = 0, + NVML_P2P_STATUS_CHIPSET_NOT_SUPPORED, + NVML_P2P_STATUS_GPU_NOT_SUPPORTED, + NVML_P2P_STATUS_IOH_TOPOLOGY_NOT_SUPPORTED, + NVML_P2P_STATUS_DISABLED_BY_REGKEY, + NVML_P2P_STATUS_NOT_SUPPORTED, + NVML_P2P_STATUS_UNKNOWN + +} nvmlGpuP2PStatus_t; + +/* P2P Capability Index*/ +typedef enum nvmlGpuP2PCapsIndex_enum +{ + NVML_P2P_CAPS_INDEX_READ = 0, + NVML_P2P_CAPS_INDEX_WRITE, + NVML_P2P_CAPS_INDEX_NVLINK, + NVML_P2P_CAPS_INDEX_ATOMICS, + NVML_P2P_CAPS_INDEX_PROP, + NVML_P2P_CAPS_INDEX_UNKNOWN +}nvmlGpuP2PCapsIndex_t; + +/** + * Maximum limit on Physical Bridges per Board + */ +#define NVML_MAX_PHYSICAL_BRIDGE (128) + +/** + * Information about the Bridge Chip Firmware + */ +typedef struct nvmlBridgeChipInfo_st +{ + nvmlBridgeChipType_t type; //!< Type of Bridge Chip + unsigned int fwVersion; //!< Firmware Version. 0=Version is unavailable +}nvmlBridgeChipInfo_t; + +/** + * This structure stores the complete Hierarchy of the Bridge Chip within the board. The immediate + * bridge is stored at index 0 of bridgeInfoList, parent to immediate bridge is at index 1 and so forth. 
+ */ +typedef struct nvmlBridgeChipHierarchy_st +{ + unsigned char bridgeCount; //!< Number of Bridge Chips on the Board + nvmlBridgeChipInfo_t bridgeChipInfo[NVML_MAX_PHYSICAL_BRIDGE]; //!< Hierarchy of Bridge Chips on the board +}nvmlBridgeChipHierarchy_t; + +/** + * Represents Type of Sampling Event + */ +typedef enum nvmlSamplingType_enum +{ + NVML_TOTAL_POWER_SAMPLES = 0, //!< To represent total power drawn by GPU + NVML_GPU_UTILIZATION_SAMPLES = 1, //!< To represent percent of time during which one or more kernels was executing on the GPU + NVML_MEMORY_UTILIZATION_SAMPLES = 2, //!< To represent percent of time during which global (device) memory was being read or written + NVML_ENC_UTILIZATION_SAMPLES = 3, //!< To represent percent of time during which NVENC remains busy + NVML_DEC_UTILIZATION_SAMPLES = 4, //!< To represent percent of time during which NVDEC remains busy + NVML_PROCESSOR_CLK_SAMPLES = 5, //!< To represent processor clock samples + NVML_MEMORY_CLK_SAMPLES = 6, //!< To represent memory clock samples + + // Keep this last + NVML_SAMPLINGTYPE_COUNT +}nvmlSamplingType_t; + +/** + * Represents the queryable PCIe utilization counters + */ +typedef enum nvmlPcieUtilCounter_enum +{ + NVML_PCIE_UTIL_TX_BYTES = 0, // 1KB granularity + NVML_PCIE_UTIL_RX_BYTES = 1, // 1KB granularity + + // Keep this last + NVML_PCIE_UTIL_COUNT +} nvmlPcieUtilCounter_t; + +/** + * Represents the type for sample value returned + */ +typedef enum nvmlValueType_enum +{ + NVML_VALUE_TYPE_DOUBLE = 0, + NVML_VALUE_TYPE_UNSIGNED_INT = 1, + NVML_VALUE_TYPE_UNSIGNED_LONG = 2, + NVML_VALUE_TYPE_UNSIGNED_LONG_LONG = 3, + NVML_VALUE_TYPE_SIGNED_LONG_LONG = 4, + + // Keep this last + NVML_VALUE_TYPE_COUNT +}nvmlValueType_t; + + +/** + * Union to represent different types of Value + */ +typedef union nvmlValue_st +{ + double dVal; //!< If the value is double + unsigned int uiVal; //!< If the value is unsigned int + unsigned long ulVal; //!< If the value is unsigned long + unsigned long 
long ullVal; //!< If the value is unsigned long long + signed long long sllVal; //!< If the value is signed long long +}nvmlValue_t; + +/** + * Information for Sample + */ +typedef struct nvmlSample_st +{ + unsigned long long timeStamp; //!< CPU Timestamp in microseconds + nvmlValue_t sampleValue; //!< Sample Value +}nvmlSample_t; + +/** + * Represents type of perf policy for which violation times can be queried + */ +typedef enum nvmlPerfPolicyType_enum +{ + NVML_PERF_POLICY_POWER = 0, //!< How long did power violations cause the GPU to be below application clocks + NVML_PERF_POLICY_THERMAL = 1, //!< How long did thermal violations cause the GPU to be below application clocks + NVML_PERF_POLICY_SYNC_BOOST = 2, //!< How long did sync boost cause the GPU to be below application clocks + NVML_PERF_POLICY_BOARD_LIMIT = 3, //!< How long did the board limit cause the GPU to be below application clocks + NVML_PERF_POLICY_LOW_UTILIZATION = 4, //!< How long did low utilization cause the GPU to be below application clocks + NVML_PERF_POLICY_RELIABILITY = 5, //!< How long did the board reliability limit cause the GPU to be below application clocks + + NVML_PERF_POLICY_TOTAL_APP_CLOCKS = 10, //!< Total time the GPU was held below application clocks by any limiter (0 - 5 above) + NVML_PERF_POLICY_TOTAL_BASE_CLOCKS = 11, //!< Total time the GPU was held below base clocks + + // Keep this last + NVML_PERF_POLICY_COUNT +}nvmlPerfPolicyType_t; + +/** + * Struct to hold perf policy violation status data + */ +typedef struct nvmlViolationTime_st +{ + unsigned long long referenceTime; //!< referenceTime represents CPU timestamp in microseconds + unsigned long long violationTime; //!< violationTime in Nanoseconds +}nvmlViolationTime_t; + +/** @} */ + +/***************************************************************************************************/ +/** @defgroup nvmlDeviceEnumvs Device Enums + * @{ + */ 
+/***************************************************************************************************/ + +/** + * Generic enable/disable enum. + */ +typedef enum nvmlEnableState_enum +{ + NVML_FEATURE_DISABLED = 0, //!< Feature disabled + NVML_FEATURE_ENABLED = 1 //!< Feature enabled +} nvmlEnableState_t; + +//! Generic flag used to specify the default behavior of some functions. See description of particular functions for details. +#define nvmlFlagDefault 0x00 +//! Generic flag used to force some behavior. See description of particular functions for details. +#define nvmlFlagForce 0x01 + +/** + * * The Brand of the GPU + * */ +typedef enum nvmlBrandType_enum +{ + NVML_BRAND_UNKNOWN = 0, + NVML_BRAND_QUADRO = 1, + NVML_BRAND_TESLA = 2, + NVML_BRAND_NVS = 3, + NVML_BRAND_GRID = 4, + NVML_BRAND_GEFORCE = 5, + NVML_BRAND_TITAN = 6, + + // Keep this last + NVML_BRAND_COUNT +} nvmlBrandType_t; + +/** + * Temperature thresholds. + */ +typedef enum nvmlTemperatureThresholds_enum +{ + NVML_TEMPERATURE_THRESHOLD_SHUTDOWN = 0, // Temperature at which the GPU will shut down + // for HW protection + NVML_TEMPERATURE_THRESHOLD_SLOWDOWN = 1, // Temperature at which the GPU will begin HW slowdown + NVML_TEMPERATURE_THRESHOLD_MEM_MAX = 2, // Memory Temperature at which the GPU will begin SW slowdown + NVML_TEMPERATURE_THRESHOLD_GPU_MAX = 3, // GPU Temperature at which the GPU can be throttled below base clock + // Keep this last + NVML_TEMPERATURE_THRESHOLD_COUNT +} nvmlTemperatureThresholds_t; + +/** + * Temperature sensors. + */ +typedef enum nvmlTemperatureSensors_enum +{ + NVML_TEMPERATURE_GPU = 0, //!< Temperature sensor for the GPU die + + // Keep this last + NVML_TEMPERATURE_COUNT +} nvmlTemperatureSensors_t; + +/** + * Compute mode. + * + * NVML_COMPUTEMODE_EXCLUSIVE_PROCESS was added in CUDA 4.0. + * Earlier CUDA versions supported a single exclusive mode, + * which is equivalent to NVML_COMPUTEMODE_EXCLUSIVE_THREAD in CUDA 4.0 and beyond. 
+ */ +typedef enum nvmlComputeMode_enum +{ + NVML_COMPUTEMODE_DEFAULT = 0, //!< Default compute mode -- multiple contexts per device + NVML_COMPUTEMODE_EXCLUSIVE_THREAD = 1, //!< Support Removed + NVML_COMPUTEMODE_PROHIBITED = 2, //!< Compute-prohibited mode -- no contexts per device + NVML_COMPUTEMODE_EXCLUSIVE_PROCESS = 3, //!< Compute-exclusive-process mode -- only one context per device, usable from multiple threads at a time + + // Keep this last + NVML_COMPUTEMODE_COUNT +} nvmlComputeMode_t; + +/** + * ECC bit types. + * + * @deprecated See \ref nvmlMemoryErrorType_t for a more flexible type + */ +#define nvmlEccBitType_t nvmlMemoryErrorType_t + +/** + * Single bit ECC errors + * + * @deprecated Mapped to \ref NVML_MEMORY_ERROR_TYPE_CORRECTED + */ +#define NVML_SINGLE_BIT_ECC NVML_MEMORY_ERROR_TYPE_CORRECTED + +/** + * Double bit ECC errors + * + * @deprecated Mapped to \ref NVML_MEMORY_ERROR_TYPE_UNCORRECTED + */ +#define NVML_DOUBLE_BIT_ECC NVML_MEMORY_ERROR_TYPE_UNCORRECTED + +/** + * Memory error types + */ +typedef enum nvmlMemoryErrorType_enum +{ + /** + * A memory error that was corrected + * + * For ECC errors, these are single bit errors + * For Texture memory, these are errors fixed by resend + */ + NVML_MEMORY_ERROR_TYPE_CORRECTED = 0, + /** + * A memory error that was not corrected + * + * For ECC errors, these are double bit errors + * For Texture memory, these are errors where the resend fails + */ + NVML_MEMORY_ERROR_TYPE_UNCORRECTED = 1, + + + // Keep this last + NVML_MEMORY_ERROR_TYPE_COUNT //!< Count of memory error types + +} nvmlMemoryErrorType_t; + +/** + * ECC counter types. + * + * Note: Volatile counts are reset each time the driver loads. On Windows this is once per boot. On Linux this can be more frequent. + * On Linux the driver unloads when no active clients exist. If persistence mode is enabled or there is always a driver + * client active (e.g. X11), then Linux also sees per-boot behavior. 
If not, volatile counts are reset each time a compute app + * is run. + */ +typedef enum nvmlEccCounterType_enum +{ + NVML_VOLATILE_ECC = 0, //!< Volatile counts are reset each time the driver loads. + NVML_AGGREGATE_ECC = 1, //!< Aggregate counts persist across reboots (i.e. for the lifetime of the device) + + // Keep this last + NVML_ECC_COUNTER_TYPE_COUNT //!< Count of memory counter types +} nvmlEccCounterType_t; + +/** + * Clock types. + * + * All speeds are in Mhz. + */ +typedef enum nvmlClockType_enum +{ + NVML_CLOCK_GRAPHICS = 0, //!< Graphics clock domain + NVML_CLOCK_SM = 1, //!< SM clock domain + NVML_CLOCK_MEM = 2, //!< Memory clock domain + NVML_CLOCK_VIDEO = 3, //!< Video encoder/decoder clock domain + + // Keep this last + NVML_CLOCK_COUNT //usedGpuMemory is not supported + + + unsigned long long time; //!< Amount of time in ms during which the compute context was active. The time is reported as 0 if + //!< the process is not terminated + + unsigned long long startTime; //!< CPU Timestamp in usec representing start time for the process + + unsigned int isRunning; //!< Flag to represent if the process is running (1 for running, 0 for terminated) + + unsigned int reserved[5]; //!< Reserved for future use +} nvmlAccountingStats_t; + +/** @} */ + +/***************************************************************************************************/ +/** @defgroup nvmlVgpuConstants Vgpu Constants + * @{ + */ +/***************************************************************************************************/ + +/** + * Buffer size guaranteed to be large enough for \ref nvmlVgpuTypeGetLicense + */ +#define NVML_GRID_LICENSE_BUFFER_SIZE 128 + +#define NVML_VGPU_NAME_BUFFER_SIZE 64 + +#define NVML_GRID_LICENSE_FEATURE_MAX_COUNT 3 + +/*! + * Macros for pGPU's virtualization capabilities bitfield. 
+ */ +#define NVML_VGPU_PGPU_VIRTUALIZATION_CAP_MIGRATION 0:0 +#define NVML_VGPU_PGPU_VIRTUALIZATION_CAP_MIGRATION_NO 0x0 +#define NVML_VGPU_PGPU_VIRTUALIZATION_CAP_MIGRATION_YES 0x1 + +/** @} */ + +/***************************************************************************************************/ +/** @defgroup nvmlVgpuEnum Vgpu Enum + * @{ + */ +/***************************************************************************************************/ + +/*! + * Types of VM identifiers + */ +typedef enum nvmlVgpuVmIdType { + NVML_VGPU_VM_ID_DOMAIN_ID = 0, //!< VM ID represents DOMAIN ID + NVML_VGPU_VM_ID_UUID = 1, //!< VM ID represents UUID +} nvmlVgpuVmIdType_t; + +// vGPU GUEST info state. +typedef enum nvmlVgpuGuestInfoState_enum +{ + NVML_VGPU_INSTANCE_GUEST_INFO_STATE_UNINITIALIZED = 0, //= 0 and < \a unitCount + * @param unit Reference in which to return the unit handle + * + * @return + * - \ref NVML_SUCCESS if \a unit has been set + * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized + * - \ref NVML_ERROR_INVALID_ARGUMENT if \a index is invalid or \a unit is NULL + * - \ref NVML_ERROR_UNKNOWN on any unexpected error + */ +nvmlReturn_t DECLDIR nvmlUnitGetHandleByIndex(unsigned int index, nvmlUnit_t *unit); + +/** + * Retrieves the static information associated with a unit. + * + * For S-class products. + * + * See \ref nvmlUnitInfo_t for details on available unit info. + * + * @param unit The identifier of the target unit + * @param info Reference in which to return the unit information + * + * @return + * - \ref NVML_SUCCESS if \a info has been populated + * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized + * - \ref NVML_ERROR_INVALID_ARGUMENT if \a unit is invalid or \a info is NULL + */ +nvmlReturn_t DECLDIR nvmlUnitGetUnitInfo(nvmlUnit_t unit, nvmlUnitInfo_t *info); + +/** + * Retrieves the LED state associated with this unit. + * + * For S-class products. 
+ * + * See \ref nvmlLedState_t for details on allowed states. + * + * @param unit The identifier of the target unit + * @param state Reference in which to return the current LED state + * + * @return + * - \ref NVML_SUCCESS if \a state has been set + * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized + * - \ref NVML_ERROR_INVALID_ARGUMENT if \a unit is invalid or \a state is NULL + * - \ref NVML_ERROR_NOT_SUPPORTED if this is not an S-class product + * - \ref NVML_ERROR_UNKNOWN on any unexpected error + * + * @see nvmlUnitSetLedState() + */ +nvmlReturn_t DECLDIR nvmlUnitGetLedState(nvmlUnit_t unit, nvmlLedState_t *state); + +/** + * Retrieves the PSU stats for the unit. + * + * For S-class products. + * + * See \ref nvmlPSUInfo_t for details on available PSU info. + * + * @param unit The identifier of the target unit + * @param psu Reference in which to return the PSU information + * + * @return + * - \ref NVML_SUCCESS if \a psu has been populated + * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized + * - \ref NVML_ERROR_INVALID_ARGUMENT if \a unit is invalid or \a psu is NULL + * - \ref NVML_ERROR_NOT_SUPPORTED if this is not an S-class product + * - \ref NVML_ERROR_UNKNOWN on any unexpected error + */ +nvmlReturn_t DECLDIR nvmlUnitGetPsuInfo(nvmlUnit_t unit, nvmlPSUInfo_t *psu); + +/** + * Retrieves the temperature readings for the unit, in degrees C. + * + * For S-class products. + * + * Depending on the product, readings may be available for intake (type=0), + * exhaust (type=1) and board (type=2). 
+ * + * @param unit The identifier of the target unit + * @param type The type of reading to take + * @param temp Reference in which to return the intake temperature + * + * @return + * - \ref NVML_SUCCESS if \a temp has been populated + * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized + * - \ref NVML_ERROR_INVALID_ARGUMENT if \a unit or \a type is invalid or \a temp is NULL + * - \ref NVML_ERROR_NOT_SUPPORTED if this is not an S-class product + * - \ref NVML_ERROR_UNKNOWN on any unexpected error + */ +nvmlReturn_t DECLDIR nvmlUnitGetTemperature(nvmlUnit_t unit, unsigned int type, unsigned int *temp); + +/** + * Retrieves the fan speed readings for the unit. + * + * For S-class products. + * + * See \ref nvmlUnitFanSpeeds_t for details on available fan speed info. + * + * @param unit The identifier of the target unit + * @param fanSpeeds Reference in which to return the fan speed information + * + * @return + * - \ref NVML_SUCCESS if \a fanSpeeds has been populated + * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized + * - \ref NVML_ERROR_INVALID_ARGUMENT if \a unit is invalid or \a fanSpeeds is NULL + * - \ref NVML_ERROR_NOT_SUPPORTED if this is not an S-class product + * - \ref NVML_ERROR_UNKNOWN on any unexpected error + */ +nvmlReturn_t DECLDIR nvmlUnitGetFanSpeedInfo(nvmlUnit_t unit, nvmlUnitFanSpeeds_t *fanSpeeds); + +/** + * Retrieves the set of GPU devices that are attached to the specified unit. + * + * For S-class products. + * + * The \a deviceCount argument is expected to be set to the size of the input \a devices array. 
+ * + * @param unit The identifier of the target unit + * @param deviceCount Reference in which to provide the \a devices array size, and + * to return the number of attached GPU devices + * @param devices Reference in which to return the references to the attached GPU devices + * + * @return + * - \ref NVML_SUCCESS if \a deviceCount and \a devices have been populated + * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized + * - \ref NVML_ERROR_INSUFFICIENT_SIZE if \a deviceCount indicates that the \a devices array is too small + * - \ref NVML_ERROR_INVALID_ARGUMENT if \a unit is invalid, either of \a deviceCount or \a devices is NULL + * - \ref NVML_ERROR_UNKNOWN on any unexpected error + */ +nvmlReturn_t DECLDIR nvmlUnitGetDevices(nvmlUnit_t unit, unsigned int *deviceCount, nvmlDevice_t *devices); + +/** + * Retrieves the IDs and firmware versions for any Host Interface Cards (HICs) in the system. + * + * For S-class products. + * + * The \a hwbcCount argument is expected to be set to the size of the input \a hwbcEntries array. + * The HIC must be connected to an S-class system for it to be reported by this function. 
+ * + * @param hwbcCount Size of hwbcEntries array + * @param hwbcEntries Array holding information about hwbc + * + * @return + * - \ref NVML_SUCCESS if \a hwbcCount and \a hwbcEntries have been populated + * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized + * - \ref NVML_ERROR_INVALID_ARGUMENT if either \a hwbcCount or \a hwbcEntries is NULL + * - \ref NVML_ERROR_INSUFFICIENT_SIZE if \a hwbcCount indicates that the \a hwbcEntries array is too small + */ +nvmlReturn_t DECLDIR nvmlSystemGetHicVersion(unsigned int *hwbcCount, nvmlHwbcEntry_t *hwbcEntries); +/** @} */ + +/***************************************************************************************************/ +/** @defgroup nvmlDeviceQueries Device Queries + * This chapter describes the queries that NVML can perform against each device. + * In each case the device is identified with an nvmlDevice_t handle. This handle is obtained by + * calling one of \ref nvmlDeviceGetHandleByIndex(), \ref nvmlDeviceGetHandleBySerial(), + * \ref nvmlDeviceGetHandleByPciBusId(), or \ref nvmlDeviceGetHandleByUUID(). + * @{ + */ +/***************************************************************************************************/ + + /** + * Retrieves the number of compute devices in the system. A compute device is a single GPU. + * + * For all products. + * + * Note: New nvmlDeviceGetCount_v2 (default in NVML 5.319) returns count of all devices in the system + * even if nvmlDeviceGetHandleByIndex_v2 returns NVML_ERROR_NO_PERMISSION for such device. + * Update your code to handle this error, or use NVML 4.304 or older nvml header file. + * For backward binary compatibility reasons _v1 version of the API is still present in the shared + * library. + * Old _v1 version of nvmlDeviceGetCount doesn't count devices that NVML has no permission to talk to. 
+ * + * @param deviceCount Reference in which to return the number of accessible devices + * + * @return + * - \ref NVML_SUCCESS if \a deviceCount has been set + * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized + * - \ref NVML_ERROR_INVALID_ARGUMENT if \a deviceCount is NULL + * - \ref NVML_ERROR_UNKNOWN on any unexpected error + */ +nvmlReturn_t DECLDIR nvmlDeviceGetCount(unsigned int *deviceCount); + +/** + * Acquire the handle for a particular device, based on its index. + * + * For all products. + * + * Valid indices are derived from the \a accessibleDevices count returned by + * \ref nvmlDeviceGetCount(). For example, if \a accessibleDevices is 2 the valid indices + * are 0 and 1, corresponding to GPU 0 and GPU 1. + * + * The order in which NVML enumerates devices has no guarantees of consistency between reboots. For that reason it + * is recommended that devices be looked up by their PCI ids or UUID. See + * \ref nvmlDeviceGetHandleByUUID() and \ref nvmlDeviceGetHandleByPciBusId(). + * + * Note: The NVML index may not correlate with other APIs, such as the CUDA device index. + * + * Starting from NVML 5, this API causes NVML to initialize the target GPU + * NVML may initialize additional GPUs if: + * - The target GPU is an SLI slave + * + * Note: New nvmlDeviceGetCount_v2 (default in NVML 5.319) returns count of all devices in the system + * even if nvmlDeviceGetHandleByIndex_v2 returns NVML_ERROR_NO_PERMISSION for such device. + * Update your code to handle this error, or use NVML 4.304 or older nvml header file. + * For backward binary compatibility reasons _v1 version of the API is still present in the shared + * library. + * Old _v1 version of nvmlDeviceGetCount doesn't count devices that NVML has no permission to talk to. + * + * This means that nvmlDeviceGetHandleByIndex_v2 and _v1 can return different devices for the same index. 
+ * If you don't touch macros that map old (_v1) versions to _v2 versions at the top of the file you don't + * need to worry about that. + * + * @param index The index of the target GPU, >= 0 and < \a accessibleDevices + * @param device Reference in which to return the device handle + * + * @return + * - \ref NVML_SUCCESS if \a device has been set + * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized + * - \ref NVML_ERROR_INVALID_ARGUMENT if \a index is invalid or \a device is NULL + * - \ref NVML_ERROR_INSUFFICIENT_POWER if any attached devices have improperly attached external power cables + * - \ref NVML_ERROR_NO_PERMISSION if the user doesn't have permission to talk to this device + * - \ref NVML_ERROR_IRQ_ISSUE if NVIDIA kernel detected an interrupt issue with the attached GPUs + * - \ref NVML_ERROR_GPU_IS_LOST if the target GPU has fallen off the bus or is otherwise inaccessible + * - \ref NVML_ERROR_UNKNOWN on any unexpected error + * + * @see nvmlDeviceGetIndex + * @see nvmlDeviceGetCount + */ +nvmlReturn_t DECLDIR nvmlDeviceGetHandleByIndex(unsigned int index, nvmlDevice_t *device); + +/** + * Acquire the handle for a particular device, based on its board serial number. + * + * For Fermi &tm; or newer fully supported devices. + * + * This number corresponds to the value printed directly on the board, and to the value returned by + * \ref nvmlDeviceGetSerial(). + * + * @deprecated Since more than one GPU can exist on a single board this function is deprecated in favor + * of \ref nvmlDeviceGetHandleByUUID. + * For dual GPU boards this function will return NVML_ERROR_INVALID_ARGUMENT. 
+ * + * Starting from NVML 5, this API causes NVML to initialize the target GPU + * NVML may initialize additional GPUs as it searches for the target GPU + * + * @param serial The board serial number of the target GPU + * @param device Reference in which to return the device handle + * + * @return + * - \ref NVML_SUCCESS if \a device has been set + * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized + * - \ref NVML_ERROR_INVALID_ARGUMENT if \a serial is invalid, \a device is NULL or more than one + * device has the same serial (dual GPU boards) + * - \ref NVML_ERROR_NOT_FOUND if \a serial does not match a valid device on the system + * - \ref NVML_ERROR_INSUFFICIENT_POWER if any attached devices have improperly attached external power cables + * - \ref NVML_ERROR_IRQ_ISSUE if NVIDIA kernel detected an interrupt issue with the attached GPUs + * - \ref NVML_ERROR_GPU_IS_LOST if any GPU has fallen off the bus or is otherwise inaccessible + * - \ref NVML_ERROR_UNKNOWN on any unexpected error + * + * @see nvmlDeviceGetSerial + * @see nvmlDeviceGetHandleByUUID + */ +nvmlReturn_t DECLDIR nvmlDeviceGetHandleBySerial(const char *serial, nvmlDevice_t *device); + +/** + * Acquire the handle for a particular device, based on its globally unique immutable UUID associated with each device. + * + * For all products. 
+ * + * @param uuid The UUID of the target GPU + * @param device Reference in which to return the device handle + * + * Starting from NVML 5, this API causes NVML to initialize the target GPU + * NVML may initialize additional GPUs as it searches for the target GPU + * + * @return + * - \ref NVML_SUCCESS if \a device has been set + * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized + * - \ref NVML_ERROR_INVALID_ARGUMENT if \a uuid is invalid or \a device is null + * - \ref NVML_ERROR_NOT_FOUND if \a uuid does not match a valid device on the system + * - \ref NVML_ERROR_INSUFFICIENT_POWER if any attached devices have improperly attached external power cables + * - \ref NVML_ERROR_IRQ_ISSUE if NVIDIA kernel detected an interrupt issue with the attached GPUs + * - \ref NVML_ERROR_GPU_IS_LOST if any GPU has fallen off the bus or is otherwise inaccessible + * - \ref NVML_ERROR_UNKNOWN on any unexpected error + * + * @see nvmlDeviceGetUUID + */ +nvmlReturn_t DECLDIR nvmlDeviceGetHandleByUUID(const char *uuid, nvmlDevice_t *device); + +/** + * Acquire the handle for a particular device, based on its PCI bus id. + * + * For all products. + * + * This value corresponds to the nvmlPciInfo_t::busId returned by \ref nvmlDeviceGetPciInfo(). + * + * Starting from NVML 5, this API causes NVML to initialize the target GPU + * NVML may initialize additional GPUs if: + * - The target GPU is an SLI slave + * + * \note NVML 4.304 and older version of nvmlDeviceGetHandleByPciBusId"_v1" returns NVML_ERROR_NOT_FOUND + * instead of NVML_ERROR_NO_PERMISSION. 
+ * + * @param pciBusId The PCI bus id of the target GPU + * @param device Reference in which to return the device handle + * + * @return + * - \ref NVML_SUCCESS if \a device has been set + * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized + * - \ref NVML_ERROR_INVALID_ARGUMENT if \a pciBusId is invalid or \a device is NULL + * - \ref NVML_ERROR_NOT_FOUND if \a pciBusId does not match a valid device on the system + * - \ref NVML_ERROR_INSUFFICIENT_POWER if the attached device has improperly attached external power cables + * - \ref NVML_ERROR_NO_PERMISSION if the user doesn't have permission to talk to this device + * - \ref NVML_ERROR_IRQ_ISSUE if NVIDIA kernel detected an interrupt issue with the attached GPUs + * - \ref NVML_ERROR_GPU_IS_LOST if the target GPU has fallen off the bus or is otherwise inaccessible + * - \ref NVML_ERROR_UNKNOWN on any unexpected error + */ +nvmlReturn_t DECLDIR nvmlDeviceGetHandleByPciBusId(const char *pciBusId, nvmlDevice_t *device); + +/** + * Retrieves the name of this device. + * + * For all products. + * + * The name is an alphanumeric string that denotes a particular product, e.g. Tesla &tm; C2070. It will not + * exceed 64 characters in length (including the NULL terminator). See \ref + * nvmlConstants::NVML_DEVICE_NAME_BUFFER_SIZE. 
+ * + * @param device The identifier of the target device + * @param name Reference in which to return the product name + * @param length The maximum allowed length of the string returned in \a name + * + * @return + * - \ref NVML_SUCCESS if \a name has been set + * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized + * - \ref NVML_ERROR_INVALID_ARGUMENT if \a device is invalid, or \a name is NULL + * - \ref NVML_ERROR_INSUFFICIENT_SIZE if \a length is too small + * - \ref NVML_ERROR_GPU_IS_LOST if the target GPU has fallen off the bus or is otherwise inaccessible + * - \ref NVML_ERROR_UNKNOWN on any unexpected error + */ +nvmlReturn_t DECLDIR nvmlDeviceGetName(nvmlDevice_t device, char *name, unsigned int length); + +/** + * Retrieves the brand of this device. + * + * For all products. + * + * The type is a member of \ref nvmlBrandType_t defined above. + * + * @param device The identifier of the target device + * @param type Reference in which to return the product brand type + * + * @return + * - \ref NVML_SUCCESS if \a type has been set + * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized + * - \ref NVML_ERROR_INVALID_ARGUMENT if \a device is invalid, or \a type is NULL + * - \ref NVML_ERROR_GPU_IS_LOST if the target GPU has fallen off the bus or is otherwise inaccessible + * - \ref NVML_ERROR_UNKNOWN on any unexpected error + */ +nvmlReturn_t DECLDIR nvmlDeviceGetBrand(nvmlDevice_t device, nvmlBrandType_t *type); + +/** + * Retrieves the NVML index of this device. + * + * For all products. + * + * Valid indices are derived from the \a accessibleDevices count returned by + * \ref nvmlDeviceGetCount(). For example, if \a accessibleDevices is 2 the valid indices + * are 0 and 1, corresponding to GPU 0 and GPU 1. + * + * The order in which NVML enumerates devices has no guarantees of consistency between reboots.
For that reason it + * is recommended that devices be looked up by their PCI ids or GPU UUID. See + * \ref nvmlDeviceGetHandleByPciBusId() and \ref nvmlDeviceGetHandleByUUID(). + * + * Note: The NVML index may not correlate with other APIs, such as the CUDA device index. + * + * @param device The identifier of the target device + * @param index Reference in which to return the NVML index of the device + * + * @return + * - \ref NVML_SUCCESS if \a index has been set + * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized + * - \ref NVML_ERROR_INVALID_ARGUMENT if \a device is invalid, or \a index is NULL + * - \ref NVML_ERROR_GPU_IS_LOST if the target GPU has fallen off the bus or is otherwise inaccessible + * - \ref NVML_ERROR_UNKNOWN on any unexpected error + * + * @see nvmlDeviceGetHandleByIndex() + * @see nvmlDeviceGetCount() + */ +nvmlReturn_t DECLDIR nvmlDeviceGetIndex(nvmlDevice_t device, unsigned int *index); + +/** + * Retrieves the globally unique board serial number associated with this device's board. + * + * For all products with an inforom. + * + * The serial number is an alphanumeric string that will not exceed 30 characters (including the NULL terminator). + * This number matches the serial number tag that is physically attached to the board. See \ref + * nvmlConstants::NVML_DEVICE_SERIAL_BUFFER_SIZE. 
+ * + * @param device The identifier of the target device + * @param serial Reference in which to return the board/module serial number + * @param length The maximum allowed length of the string returned in \a serial + * + * @return + * - \ref NVML_SUCCESS if \a serial has been set + * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized + * - \ref NVML_ERROR_INVALID_ARGUMENT if \a device is invalid, or \a serial is NULL + * - \ref NVML_ERROR_INSUFFICIENT_SIZE if \a length is too small + * - \ref NVML_ERROR_NOT_SUPPORTED if the device does not support this feature + * - \ref NVML_ERROR_GPU_IS_LOST if the target GPU has fallen off the bus or is otherwise inaccessible + * - \ref NVML_ERROR_UNKNOWN on any unexpected error + */ +nvmlReturn_t DECLDIR nvmlDeviceGetSerial(nvmlDevice_t device, char *serial, unsigned int length); + +/** + * Retrieves an array of unsigned ints (sized to cpuSetSize) of bitmasks with the ideal CPU affinity for the device + * For example, if processors 0, 1, 32, and 33 are ideal for the device and cpuSetSize == 2, + * result[0] = 0x3, result[1] = 0x3 + * + * For Kepler &tm; or newer fully supported devices. + * Supported on Linux only. 
+ * + * @param device The identifier of the target device + * @param cpuSetSize The size of the cpuSet array that is safe to access + * @param cpuSet Array reference in which to return a bitmask of CPUs, 64 CPUs per + * unsigned long on 64-bit machines, 32 on 32-bit machines + * + * @return + * - \ref NVML_SUCCESS if \a cpuSet has been filled + * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized + * - \ref NVML_ERROR_INVALID_ARGUMENT if \a device is invalid, cpuSetSize == 0, or cpuSet is NULL + * - \ref NVML_ERROR_NOT_SUPPORTED if the device does not support this feature + * - \ref NVML_ERROR_GPU_IS_LOST if the target GPU has fallen off the bus or is otherwise inaccessible + * - \ref NVML_ERROR_UNKNOWN on any unexpected error + */ +nvmlReturn_t DECLDIR nvmlDeviceGetCpuAffinity(nvmlDevice_t device, unsigned int cpuSetSize, unsigned long *cpuSet); + +/** + * Sets the ideal affinity for the calling thread and device using the guidelines + * given in nvmlDeviceGetCpuAffinity(). Note, this is a change as of version 8.0. + * Older versions set the affinity for a calling process and all children. + * Currently supports up to 64 processors. + * + * For Kepler &tm; or newer fully supported devices. + * Supported on Linux only. + * + * @param device The identifier of the target device + * + * @return + * - \ref NVML_SUCCESS if the calling process has been successfully bound + * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized + * - \ref NVML_ERROR_INVALID_ARGUMENT if \a device is invalid + * - \ref NVML_ERROR_NOT_SUPPORTED if the device does not support this feature + * - \ref NVML_ERROR_GPU_IS_LOST if the target GPU has fallen off the bus or is otherwise inaccessible + * - \ref NVML_ERROR_UNKNOWN on any unexpected error + */ +nvmlReturn_t DECLDIR nvmlDeviceSetCpuAffinity(nvmlDevice_t device); + +/** + * Clear all affinity bindings for the calling thread.
Note, this is a change as of version + * 8.0 as older versions cleared the affinity for a calling process and all children. + * + * For Kepler &tm; or newer fully supported devices. + * Supported on Linux only. + * + * @param device The identifier of the target device + * + * @return + * - \ref NVML_SUCCESS if the calling process has been successfully unbound + * - \ref NVML_ERROR_INVALID_ARGUMENT if \a device is invalid + * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized + * - \ref NVML_ERROR_UNKNOWN on any unexpected error + */ +nvmlReturn_t DECLDIR nvmlDeviceClearCpuAffinity(nvmlDevice_t device); + +/** + * Retrieve the common ancestor for two devices + * For all products. + * Supported on Linux only. + * + * @param device1 The identifier of the first device + * @param device2 The identifier of the second device + * @param pathInfo A \ref nvmlGpuTopologyLevel_t that gives the path type + * + * @return + * - \ref NVML_SUCCESS if \a pathInfo has been set + * - \ref NVML_ERROR_INVALID_ARGUMENT if \a device1, or \a device2 is invalid, or \a pathInfo is NULL + * - \ref NVML_ERROR_NOT_SUPPORTED if the device or OS does not support this feature + * - \ref NVML_ERROR_UNKNOWN an error has occurred in underlying topology discovery + */ +nvmlReturn_t DECLDIR nvmlDeviceGetTopologyCommonAncestor(nvmlDevice_t device1, nvmlDevice_t device2, nvmlGpuTopologyLevel_t *pathInfo); + +/** + * Retrieve the set of GPUs that are nearest to a given device at a specific interconnectivity level + * For all products. + * Supported on Linux only. + * + * @param device The identifier of the first device + * @param level The \ref nvmlGpuTopologyLevel_t level to search for other GPUs + * @param count When zero, is set to the number of matching GPUs such that \a deviceArray + * can be malloc'd. When non-zero, \a deviceArray will be filled with \a count + * number of device handles. 
+ * @param deviceArray An array of device handles for GPUs found at \a level + * + * @return + * - \ref NVML_SUCCESS if \a deviceArray or \a count (if initially zero) has been set + * - \ref NVML_ERROR_INVALID_ARGUMENT if \a device, \a level, or \a count is invalid, or \a deviceArray is NULL with a non-zero \a count + * - \ref NVML_ERROR_NOT_SUPPORTED if the device or OS does not support this feature + * - \ref NVML_ERROR_UNKNOWN an error has occurred in underlying topology discovery + */ +nvmlReturn_t DECLDIR nvmlDeviceGetTopologyNearestGpus(nvmlDevice_t device, nvmlGpuTopologyLevel_t level, unsigned int *count, nvmlDevice_t *deviceArray); + +/** + * Retrieve the set of GPUs that have a CPU affinity with the given CPU number + * For all products. + * Supported on Linux only. + * + * @param cpuNumber The CPU number + * @param count When zero, is set to the number of matching GPUs such that \a deviceArray + * can be malloc'd. When non-zero, \a deviceArray will be filled with \a count + * number of device handles. 
+ * @param deviceArray An array of device handles for GPUs found with affinity to \a cpuNumber + * + * @return + * - \ref NVML_SUCCESS if \a deviceArray or \a count (if initially zero) has been set + * - \ref NVML_ERROR_INVALID_ARGUMENT if \a cpuNumber, or \a count is invalid, or \a deviceArray is NULL with a non-zero \a count + * - \ref NVML_ERROR_NOT_SUPPORTED if the device or OS does not support this feature + * - \ref NVML_ERROR_UNKNOWN an error has occurred in underlying topology discovery + */ +nvmlReturn_t DECLDIR nvmlSystemGetTopologyGpuSet(unsigned int cpuNumber, unsigned int *count, nvmlDevice_t *deviceArray); + +/** + * Retrieve the status for a given p2p capability index between a given pair of GPU + * + * @param device1 The first device + * @param device2 The second device + * @param p2pIndex p2p Capability Index being looked for between \a device1 and \a device2 + * @param p2pStatus Reference in which to return the status of the \a p2pIndex + * between \a device1 and \a device2 + * @return + * - \ref NVML_SUCCESS if \a p2pStatus has been populated + * - \ref NVML_ERROR_INVALID_ARGUMENT if \a device1 or \a device2 or \a p2pIndex is invalid or \a p2pStatus is NULL + * - \ref NVML_ERROR_UNKNOWN on any unexpected error + */ +nvmlReturn_t DECLDIR nvmlDeviceGetP2PStatus(nvmlDevice_t device1, nvmlDevice_t device2, nvmlGpuP2PCapsIndex_t p2pIndex,nvmlGpuP2PStatus_t *p2pStatus); + +/** + * Retrieves the globally unique immutable UUID associated with this device, as a 5 part hexadecimal string, + * that augments the immutable, board serial identifier. + * + * For all products. + * + * The UUID is a globally unique identifier. It is the only available identifier for pre-Fermi-architecture products. + * It does NOT correspond to any identifier printed on the board. It will not exceed 80 characters in length + * (including the NULL terminator). See \ref nvmlConstants::NVML_DEVICE_UUID_BUFFER_SIZE. 
+ * + * @param device The identifier of the target device + * @param uuid Reference in which to return the GPU UUID + * @param length The maximum allowed length of the string returned in \a uuid + * + * @return + * - \ref NVML_SUCCESS if \a uuid has been set + * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized + * - \ref NVML_ERROR_INVALID_ARGUMENT if \a device is invalid, or \a uuid is NULL + * - \ref NVML_ERROR_INSUFFICIENT_SIZE if \a length is too small + * - \ref NVML_ERROR_NOT_SUPPORTED if the device does not support this feature + * - \ref NVML_ERROR_GPU_IS_LOST if the target GPU has fallen off the bus or is otherwise inaccessible + * - \ref NVML_ERROR_UNKNOWN on any unexpected error + */ +nvmlReturn_t DECLDIR nvmlDeviceGetUUID(nvmlDevice_t device, char *uuid, unsigned int length); + +/** + * Retrieves minor number for the device. The minor number for the device is such that the Nvidia device node file for + * each GPU will have the form /dev/nvidia[minor number]. + * + * For all products. + * Supported only for Linux + * + * @param device The identifier of the target device + * @param minorNumber Reference in which to return the minor number for the device + * @return + * - \ref NVML_SUCCESS if the minor number is successfully retrieved + * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized + * - \ref NVML_ERROR_INVALID_ARGUMENT if \a device is invalid or \a minorNumber is NULL + * - \ref NVML_ERROR_NOT_SUPPORTED if this query is not supported by the device + * - \ref NVML_ERROR_GPU_IS_LOST if the target GPU has fallen off the bus or is otherwise inaccessible + * - \ref NVML_ERROR_UNKNOWN on any unexpected error + */ +nvmlReturn_t DECLDIR nvmlDeviceGetMinorNumber(nvmlDevice_t device, unsigned int *minorNumber); + +/** + * Retrieves the device board part number which is programmed into the board's InfoROM + * + * For all products.
+ * + * @param device Identifier of the target device + * @param partNumber Reference to the buffer to return + * @param length Length of the buffer reference + * + * @return + * - \ref NVML_SUCCESS if \a partNumber has been set + * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized + * - \ref NVML_ERROR_NOT_SUPPORTED if the needed VBIOS fields have not been filled + * - \ref NVML_ERROR_INVALID_ARGUMENT if \a device is invalid or \a partNumber is NULL + * - \ref NVML_ERROR_GPU_IS_LOST if the target GPU has fallen off the bus or is otherwise inaccessible + * - \ref NVML_ERROR_UNKNOWN on any unexpected error + */ +nvmlReturn_t DECLDIR nvmlDeviceGetBoardPartNumber(nvmlDevice_t device, char* partNumber, unsigned int length); + +/** + * Retrieves the version information for the device's infoROM object. + * + * For all products with an inforom. + * + * Fermi and higher parts have non-volatile on-board memory for persisting device info, such as aggregate + * ECC counts. The version of the data structures in this memory may change from time to time. It will not + * exceed 16 characters in length (including the NULL terminator). + * See \ref nvmlConstants::NVML_DEVICE_INFOROM_VERSION_BUFFER_SIZE. + * + * See \ref nvmlInforomObject_t for details on the available infoROM objects.
+ * + * @param device The identifier of the target device + * @param object The target infoROM object + * @param version Reference in which to return the infoROM version + * @param length The maximum allowed length of the string returned in \a version + * + * @return + * - \ref NVML_SUCCESS if \a version has been set + * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized + * - \ref NVML_ERROR_INVALID_ARGUMENT if \a version is NULL + * - \ref NVML_ERROR_INSUFFICIENT_SIZE if \a length is too small + * - \ref NVML_ERROR_NOT_SUPPORTED if the device does not have an infoROM + * - \ref NVML_ERROR_GPU_IS_LOST if the target GPU has fallen off the bus or is otherwise inaccessible + * - \ref NVML_ERROR_UNKNOWN on any unexpected error + * + * @see nvmlDeviceGetInforomImageVersion + */ +nvmlReturn_t DECLDIR nvmlDeviceGetInforomVersion(nvmlDevice_t device, nvmlInforomObject_t object, char *version, unsigned int length); + +/** + * Retrieves the global infoROM image version + * + * For all products with an inforom. + * + * Image version just like VBIOS version uniquely describes the exact version of the infoROM flashed on the board + * in contrast to infoROM object version which is only an indicator of supported features. + * Version string will not exceed 16 characters in length (including the NULL terminator). + * See \ref nvmlConstants::NVML_DEVICE_INFOROM_VERSION_BUFFER_SIZE. 
+ * + * @param device The identifier of the target device + * @param version Reference in which to return the infoROM image version + * @param length The maximum allowed length of the string returned in \a version + * + * @return + * - \ref NVML_SUCCESS if \a version has been set + * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized + * - \ref NVML_ERROR_INVALID_ARGUMENT if \a version is NULL + * - \ref NVML_ERROR_INSUFFICIENT_SIZE if \a length is too small + * - \ref NVML_ERROR_NOT_SUPPORTED if the device does not have an infoROM + * - \ref NVML_ERROR_GPU_IS_LOST if the target GPU has fallen off the bus or is otherwise inaccessible + * - \ref NVML_ERROR_UNKNOWN on any unexpected error + * + * @see nvmlDeviceGetInforomVersion + */ +nvmlReturn_t DECLDIR nvmlDeviceGetInforomImageVersion(nvmlDevice_t device, char *version, unsigned int length); + +/** + * Retrieves the checksum of the configuration stored in the device's infoROM. + * + * For all products with an inforom. + * + * Can be used to make sure that two GPUs have the exact same configuration. + * Current checksum takes into account configuration stored in PWR and ECC infoROM objects. + * Checksum can change between driver releases or when user changes configuration (e.g. 
disable/enable ECC) + * + * @param device The identifier of the target device + * @param checksum Reference in which to return the infoROM configuration checksum + * + * @return + * - \ref NVML_SUCCESS if \a checksum has been set + * - \ref NVML_ERROR_CORRUPTED_INFOROM if the device's checksum couldn't be retrieved due to infoROM corruption + * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized + * - \ref NVML_ERROR_INVALID_ARGUMENT if \a checksum is NULL + * - \ref NVML_ERROR_NOT_SUPPORTED if the device does not support this feature + * - \ref NVML_ERROR_GPU_IS_LOST if the target GPU has fallen off the bus or is otherwise inaccessible + * - \ref NVML_ERROR_UNKNOWN on any unexpected error + */ +nvmlReturn_t DECLDIR nvmlDeviceGetInforomConfigurationChecksum(nvmlDevice_t device, unsigned int *checksum); + +/** + * Reads the infoROM from the flash and verifies the checksums. + * + * For all products with an inforom. + * + * @param device The identifier of the target device + * + * @return + * - \ref NVML_SUCCESS if infoROM is not corrupted + * - \ref NVML_ERROR_CORRUPTED_INFOROM if the device's infoROM is corrupted + * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized + * - \ref NVML_ERROR_NOT_SUPPORTED if the device does not support this feature + * - \ref NVML_ERROR_GPU_IS_LOST if the target GPU has fallen off the bus or is otherwise inaccessible + * - \ref NVML_ERROR_UNKNOWN on any unexpected error + */ +nvmlReturn_t DECLDIR nvmlDeviceValidateInforom(nvmlDevice_t device); + +/** + * Retrieves the display mode for the device. + * + * For all products. + * + * This method indicates whether a physical display (e.g. monitor) is currently connected to + * any of the device's connectors. + * + * See \ref nvmlEnableState_t for details on allowed modes. 
+ * + * @param device The identifier of the target device + * @param display Reference in which to return the display mode + * + * @return + * - \ref NVML_SUCCESS if \a display has been set + * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized + * - \ref NVML_ERROR_INVALID_ARGUMENT if \a device is invalid or \a display is NULL + * - \ref NVML_ERROR_NOT_SUPPORTED if the device does not support this feature + * - \ref NVML_ERROR_GPU_IS_LOST if the target GPU has fallen off the bus or is otherwise inaccessible + * - \ref NVML_ERROR_UNKNOWN on any unexpected error + */ +nvmlReturn_t DECLDIR nvmlDeviceGetDisplayMode(nvmlDevice_t device, nvmlEnableState_t *display); + +/** + * Retrieves the display active state for the device. + * + * For all products. + * + * This method indicates whether a display is initialized on the device. + * For example whether X Server is attached to this device and has allocated memory for the screen. + * + * Display can be active even when no monitor is physically attached. + * + * See \ref nvmlEnableState_t for details on allowed modes. + * + * @param device The identifier of the target device + * @param isActive Reference in which to return the display active state + * + * @return + * - \ref NVML_SUCCESS if \a isActive has been set + * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized + * - \ref NVML_ERROR_INVALID_ARGUMENT if \a device is invalid or \a isActive is NULL + * - \ref NVML_ERROR_NOT_SUPPORTED if the device does not support this feature + * - \ref NVML_ERROR_GPU_IS_LOST if the target GPU has fallen off the bus or is otherwise inaccessible + * - \ref NVML_ERROR_UNKNOWN on any unexpected error + */ +nvmlReturn_t DECLDIR nvmlDeviceGetDisplayActive(nvmlDevice_t device, nvmlEnableState_t *isActive); + +/** + * Retrieves the persistence mode associated with this device. + * + * For all products. + * For Linux only. 
+ * + * When driver persistence mode is enabled the driver software state is not torn down when the last + * client disconnects. By default this feature is disabled. + * + * See \ref nvmlEnableState_t for details on allowed modes. + * + * @param device The identifier of the target device + * @param mode Reference in which to return the current driver persistence mode + * + * @return + * - \ref NVML_SUCCESS if \a mode has been set + * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized + * - \ref NVML_ERROR_INVALID_ARGUMENT if \a device is invalid or \a mode is NULL + * - \ref NVML_ERROR_NOT_SUPPORTED if the device does not support this feature + * - \ref NVML_ERROR_GPU_IS_LOST if the target GPU has fallen off the bus or is otherwise inaccessible + * - \ref NVML_ERROR_UNKNOWN on any unexpected error + * + * @see nvmlDeviceSetPersistenceMode() + */ +nvmlReturn_t DECLDIR nvmlDeviceGetPersistenceMode(nvmlDevice_t device, nvmlEnableState_t *mode); + +/** + * Retrieves the PCI attributes of this device. + * + * For all products. + * + * See \ref nvmlPciInfo_t for details on the available PCI info. + * + * @param device The identifier of the target device + * @param pci Reference in which to return the PCI info + * + * @return + * - \ref NVML_SUCCESS if \a pci has been populated + * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized + * - \ref NVML_ERROR_INVALID_ARGUMENT if \a device is invalid or \a pci is NULL + * - \ref NVML_ERROR_GPU_IS_LOST if the target GPU has fallen off the bus or is otherwise inaccessible + * - \ref NVML_ERROR_UNKNOWN on any unexpected error + */ +nvmlReturn_t DECLDIR nvmlDeviceGetPciInfo(nvmlDevice_t device, nvmlPciInfo_t *pci); + +/** + * Retrieves the maximum PCIe link generation possible with this device and system + * + * I.E. for a generation 2 PCIe device attached to a generation 1 PCIe bus the max link generation this function will + * report is generation 1. 
+ * + * For Fermi &tm; or newer fully supported devices. + * + * @param device The identifier of the target device + * @param maxLinkGen Reference in which to return the max PCIe link generation + * + * @return + * - \ref NVML_SUCCESS if \a maxLinkGen has been populated + * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized + * - \ref NVML_ERROR_INVALID_ARGUMENT if \a device is invalid or \a maxLinkGen is null + * - \ref NVML_ERROR_NOT_SUPPORTED if PCIe link information is not available + * - \ref NVML_ERROR_GPU_IS_LOST if the target GPU has fallen off the bus or is otherwise inaccessible + * - \ref NVML_ERROR_UNKNOWN on any unexpected error + */ +nvmlReturn_t DECLDIR nvmlDeviceGetMaxPcieLinkGeneration(nvmlDevice_t device, unsigned int *maxLinkGen); + +/** + * Retrieves the maximum PCIe link width possible with this device and system + * + * I.E. for a device with a 16x PCIe bus width attached to a 8x PCIe system bus this function will report + * a max link width of 8. + * + * For Fermi &tm; or newer fully supported devices. + * + * @param device The identifier of the target device + * @param maxLinkWidth Reference in which to return the max PCIe link width + * + * @return + * - \ref NVML_SUCCESS if \a maxLinkWidth has been populated + * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized + * - \ref NVML_ERROR_INVALID_ARGUMENT if \a device is invalid or \a maxLinkWidth is null + * - \ref NVML_ERROR_NOT_SUPPORTED if PCIe link information is not available + * - \ref NVML_ERROR_GPU_IS_LOST if the target GPU has fallen off the bus or is otherwise inaccessible + * - \ref NVML_ERROR_UNKNOWN on any unexpected error + */ +nvmlReturn_t DECLDIR nvmlDeviceGetMaxPcieLinkWidth(nvmlDevice_t device, unsigned int *maxLinkWidth); + +/** + * Retrieves the current PCIe link generation + * + * For Fermi &tm; or newer fully supported devices.
+ * + * @param device The identifier of the target device + * @param currLinkGen Reference in which to return the current PCIe link generation + * + * @return + * - \ref NVML_SUCCESS if \a currLinkGen has been populated + * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized + * - \ref NVML_ERROR_INVALID_ARGUMENT if \a device is invalid or \a currLinkGen is null + * - \ref NVML_ERROR_NOT_SUPPORTED if PCIe link information is not available + * - \ref NVML_ERROR_GPU_IS_LOST if the target GPU has fallen off the bus or is otherwise inaccessible + * - \ref NVML_ERROR_UNKNOWN on any unexpected error + */ +nvmlReturn_t DECLDIR nvmlDeviceGetCurrPcieLinkGeneration(nvmlDevice_t device, unsigned int *currLinkGen); + +/** + * Retrieves the current PCIe link width + * + * For Fermi &tm; or newer fully supported devices. + * + * @param device The identifier of the target device + * @param currLinkWidth Reference in which to return the current PCIe link width + * + * @return + * - \ref NVML_SUCCESS if \a currLinkWidth has been populated + * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized + * - \ref NVML_ERROR_INVALID_ARGUMENT if \a device is invalid or \a currLinkWidth is null + * - \ref NVML_ERROR_NOT_SUPPORTED if PCIe link information is not available + * - \ref NVML_ERROR_GPU_IS_LOST if the target GPU has fallen off the bus or is otherwise inaccessible + * - \ref NVML_ERROR_UNKNOWN on any unexpected error + */ +nvmlReturn_t DECLDIR nvmlDeviceGetCurrPcieLinkWidth(nvmlDevice_t device, unsigned int *currLinkWidth); + +/** + * Retrieve PCIe utilization information. + * This function is querying a byte counter over a 20ms interval and thus is the + * PCIe throughput over that interval. + * + * For Maxwell &tm; or newer fully supported devices. + * + * This method is not supported in virtual machines running virtual GPU (vGPU).
+ * + * @param device The identifier of the target device + * @param counter The specific counter that should be queried \ref nvmlPcieUtilCounter_t + * @param value Reference in which to return throughput in KB/s + * + * @return + * - \ref NVML_SUCCESS if \a value has been set + * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized + * - \ref NVML_ERROR_INVALID_ARGUMENT if \a device or \a counter is invalid, or \a value is NULL + * - \ref NVML_ERROR_NOT_SUPPORTED if the device does not support this feature + * - \ref NVML_ERROR_GPU_IS_LOST if the target GPU has fallen off the bus or is otherwise inaccessible + * - \ref NVML_ERROR_UNKNOWN on any unexpected error + */ +nvmlReturn_t DECLDIR nvmlDeviceGetPcieThroughput(nvmlDevice_t device, nvmlPcieUtilCounter_t counter, unsigned int *value); + +/** + * Retrieve the PCIe replay counter. + * + * For Kepler &tm; or newer fully supported devices. + * + * @param device The identifier of the target device + * @param value Reference in which to return the counter's value + * + * @return + * - \ref NVML_SUCCESS if \a value has been set + * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized + * - \ref NVML_ERROR_INVALID_ARGUMENT if \a device is invalid, or \a value is NULL + * - \ref NVML_ERROR_NOT_SUPPORTED if the device does not support this feature + * - \ref NVML_ERROR_GPU_IS_LOST if the target GPU has fallen off the bus or is otherwise inaccessible + * - \ref NVML_ERROR_UNKNOWN on any unexpected error + */ +nvmlReturn_t DECLDIR nvmlDeviceGetPcieReplayCounter(nvmlDevice_t device, unsigned int *value); + +/** + * Retrieves the current clock speeds for the device. + * + * For Fermi &tm; or newer fully supported devices. + * + * See \ref nvmlClockType_t for details on available clock information.
+ * + * @param device The identifier of the target device + * @param type Identify which clock domain to query + * @param clock Reference in which to return the clock speed in MHz + * + * @return + * - \ref NVML_SUCCESS if \a clock has been set + * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized + * - \ref NVML_ERROR_INVALID_ARGUMENT if \a device is invalid or \a clock is NULL + * - \ref NVML_ERROR_NOT_SUPPORTED if the device cannot report the specified clock + * - \ref NVML_ERROR_GPU_IS_LOST if the target GPU has fallen off the bus or is otherwise inaccessible + * - \ref NVML_ERROR_UNKNOWN on any unexpected error + */ +nvmlReturn_t DECLDIR nvmlDeviceGetClockInfo(nvmlDevice_t device, nvmlClockType_t type, unsigned int *clock); + +/** + * Retrieves the maximum clock speeds for the device. + * + * For Fermi &tm; or newer fully supported devices. + * + * See \ref nvmlClockType_t for details on available clock information. + * + * \note On GPUs from Fermi family current P0 clocks (reported by \ref nvmlDeviceGetClockInfo) can differ from max clocks + * by few MHz. 
+ * + * @param device The identifier of the target device + * @param type Identify which clock domain to query + * @param clock Reference in which to return the clock speed in MHz + * + * @return + * - \ref NVML_SUCCESS if \a clock has been set + * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized + * - \ref NVML_ERROR_INVALID_ARGUMENT if \a device is invalid or \a clock is NULL + * - \ref NVML_ERROR_NOT_SUPPORTED if the device cannot report the specified clock + * - \ref NVML_ERROR_GPU_IS_LOST if the target GPU has fallen off the bus or is otherwise inaccessible + * - \ref NVML_ERROR_UNKNOWN on any unexpected error + */ +nvmlReturn_t DECLDIR nvmlDeviceGetMaxClockInfo(nvmlDevice_t device, nvmlClockType_t type, unsigned int *clock); + +/** + * Retrieves the current setting of a clock that applications will use unless an overspec situation occurs. + * Can be changed using \ref nvmlDeviceSetApplicationsClocks. + * + * For Kepler &tm; or newer fully supported devices. + * + * @param device The identifier of the target device + * @param clockType Identify which clock domain to query + * @param clockMHz Reference in which to return the clock in MHz + * + * @return + * - \ref NVML_SUCCESS if \a clockMHz has been set + * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized + * - \ref NVML_ERROR_INVALID_ARGUMENT if \a device is invalid or \a clockMHz is NULL or \a clockType is invalid + * - \ref NVML_ERROR_NOT_SUPPORTED if the device does not support this feature + * - \ref NVML_ERROR_GPU_IS_LOST if the target GPU has fallen off the bus or is otherwise inaccessible + * - \ref NVML_ERROR_UNKNOWN on any unexpected error + */ +nvmlReturn_t DECLDIR nvmlDeviceGetApplicationsClock(nvmlDevice_t device, nvmlClockType_t clockType, unsigned int *clockMHz); + +/** + * Retrieves the default applications clock that GPU boots with or + * defaults to after \ref nvmlDeviceResetApplicationsClocks call. 
+ * + * For Kepler &tm; or newer fully supported devices. + * + * @param device The identifier of the target device + * @param clockType Identify which clock domain to query + * @param clockMHz Reference in which to return the default clock in MHz + * + * @return + * - \ref NVML_SUCCESS if \a clockMHz has been set + * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized + * - \ref NVML_ERROR_INVALID_ARGUMENT if \a device is invalid or \a clockMHz is NULL or \a clockType is invalid + * - \ref NVML_ERROR_NOT_SUPPORTED if the device does not support this feature + * - \ref NVML_ERROR_GPU_IS_LOST if the target GPU has fallen off the bus or is otherwise inaccessible + * - \ref NVML_ERROR_UNKNOWN on any unexpected error + * + * \see nvmlDeviceGetApplicationsClock + */ +nvmlReturn_t DECLDIR nvmlDeviceGetDefaultApplicationsClock(nvmlDevice_t device, nvmlClockType_t clockType, unsigned int *clockMHz); + +/** + * Resets the application clock to the default value + * + * This is the applications clock that will be used after system reboot or driver reload. + * Default value is constant, but the current value can be changed using \ref nvmlDeviceSetApplicationsClocks. + * + * On Pascal and newer hardware, if clocks were previously locked with \ref nvmlDeviceSetApplicationsClocks, + * this call will unlock clocks. This returns clocks to their default behavior of automatically boosting above + * base clocks as thermal limits allow. + * + * @see nvmlDeviceGetApplicationsClock + * @see nvmlDeviceSetApplicationsClocks + * + * For Fermi &tm; or newer non-GeForce fully supported devices and Maxwell or newer GeForce devices.
+ * + * @param device The identifier of the target device + * + * @return + * - \ref NVML_SUCCESS if new settings were successfully set + * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized + * - \ref NVML_ERROR_INVALID_ARGUMENT if \a device is invalid + * - \ref NVML_ERROR_NOT_SUPPORTED if the device does not support this feature + * - \ref NVML_ERROR_GPU_IS_LOST if the target GPU has fallen off the bus or is otherwise inaccessible + * - \ref NVML_ERROR_UNKNOWN on any unexpected error + */ +nvmlReturn_t DECLDIR nvmlDeviceResetApplicationsClocks(nvmlDevice_t device); + +/** + * Retrieves the clock speed for the clock specified by the clock type and clock ID. + * + * For Kepler &tm; or newer fully supported devices. + * + * @param device The identifier of the target device + * @param clockType Identify which clock domain to query + * @param clockId Identify which clock in the domain to query + * @param clockMHz Reference in which to return the clock in MHz + * + * @return + * - \ref NVML_SUCCESS if \a clockMHz has been set + * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized + * - \ref NVML_ERROR_INVALID_ARGUMENT if \a device is invalid or \a clockMHz is NULL or \a clockType is invalid + * - \ref NVML_ERROR_NOT_SUPPORTED if the device does not support this feature + * - \ref NVML_ERROR_GPU_IS_LOST if the target GPU has fallen off the bus or is otherwise inaccessible + * - \ref NVML_ERROR_UNKNOWN on any unexpected error + */ +nvmlReturn_t DECLDIR nvmlDeviceGetClock(nvmlDevice_t device, nvmlClockType_t clockType, nvmlClockId_t clockId, unsigned int *clockMHz); + +/** + * Retrieves the customer defined maximum boost clock speed specified by the given clock type. + * + * For Pascal &tm; or newer fully supported devices. 
+ * + * @param device The identifier of the target device + * @param clockType Identify which clock domain to query + * @param clockMHz Reference in which to return the clock in MHz + * + * @return + * - \ref NVML_SUCCESS if \a clockMHz has been set + * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized + * - \ref NVML_ERROR_INVALID_ARGUMENT if \a device is invalid or \a clockMHz is NULL or \a clockType is invalid + * - \ref NVML_ERROR_NOT_SUPPORTED if the device or the \a clockType on this device does not support this feature + * - \ref NVML_ERROR_GPU_IS_LOST if the target GPU has fallen off the bus or is otherwise inaccessible + * - \ref NVML_ERROR_UNKNOWN on any unexpected error + */ +nvmlReturn_t DECLDIR nvmlDeviceGetMaxCustomerBoostClock(nvmlDevice_t device, nvmlClockType_t clockType, unsigned int *clockMHz); + +/** + * Retrieves the list of possible memory clocks that can be used as an argument for \ref nvmlDeviceSetApplicationsClocks. + * + * For Kepler &tm; or newer fully supported devices. 
+ * + * @param device The identifier of the target device + * @param count Reference in which to provide the \a clocksMHz array size, and + * to return the number of elements + * @param clocksMHz Reference in which to return the clock in MHz + * + * @return + * - \ref NVML_SUCCESS if \a count and \a clocksMHz have been populated + * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized + * - \ref NVML_ERROR_INVALID_ARGUMENT if \a device is invalid or \a count is NULL + * - \ref NVML_ERROR_NOT_SUPPORTED if the device does not support this feature + * - \ref NVML_ERROR_INSUFFICIENT_SIZE if \a count is too small (\a count is set to the number of + * required elements) + * - \ref NVML_ERROR_GPU_IS_LOST if the target GPU has fallen off the bus or is otherwise inaccessible + * - \ref NVML_ERROR_UNKNOWN on any unexpected error + * + * @see nvmlDeviceSetApplicationsClocks + * @see nvmlDeviceGetSupportedGraphicsClocks + */ +nvmlReturn_t DECLDIR nvmlDeviceGetSupportedMemoryClocks(nvmlDevice_t device, unsigned int *count, unsigned int *clocksMHz); + +/** + * Retrieves the list of possible graphics clocks that can be used as an argument for \ref nvmlDeviceSetApplicationsClocks. + * + * For Kepler &tm; or newer fully supported devices. 
+ * + * @param device The identifier of the target device + * @param memoryClockMHz Memory clock for which to return possible graphics clocks + * @param count Reference in which to provide the \a clocksMHz array size, and + * to return the number of elements + * @param clocksMHz Reference in which to return the clocks in MHz + * + * @return + * - \ref NVML_SUCCESS if \a count and \a clocksMHz have been populated + * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized + * - \ref NVML_ERROR_NOT_FOUND if the specified \a memoryClockMHz is not a supported frequency + * - \ref NVML_ERROR_INVALID_ARGUMENT if \a device is invalid or \a clock is NULL + * - \ref NVML_ERROR_NOT_SUPPORTED if the device does not support this feature + * - \ref NVML_ERROR_INSUFFICIENT_SIZE if \a count is too small + * - \ref NVML_ERROR_GPU_IS_LOST if the target GPU has fallen off the bus or is otherwise inaccessible + * - \ref NVML_ERROR_UNKNOWN on any unexpected error + * + * @see nvmlDeviceSetApplicationsClocks + * @see nvmlDeviceGetSupportedMemoryClocks + */ +nvmlReturn_t DECLDIR nvmlDeviceGetSupportedGraphicsClocks(nvmlDevice_t device, unsigned int memoryClockMHz, unsigned int *count, unsigned int *clocksMHz); + +/** + * Retrieve the current state of Auto Boosted clocks on a device and store it in \a isEnabled + * + * For Kepler &tm; or newer fully supported devices. + * + * Auto Boosted clocks are enabled by default on some hardware, allowing the GPU to run at higher clock rates + * to maximize performance as thermal limits allow. + * + * On Pascal and newer hardware, Auto Boosted clocks are controlled through application clocks. + * Use \ref nvmlDeviceSetApplicationsClocks and \ref nvmlDeviceResetApplicationsClocks to control Auto Boost + * behavior.
+ * + * @param device The identifier of the target device + * @param isEnabled Where to store the current state of Auto Boosted clocks of the target device + * @param defaultIsEnabled Where to store the default Auto Boosted clocks behavior of the target device that the device will + * revert to when no applications are using the GPU + * + * @return + * - \ref NVML_SUCCESS If \a isEnabled has been set with the Auto Boosted clocks state of \a device + * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized + * - \ref NVML_ERROR_INVALID_ARGUMENT if \a device is invalid or \a isEnabled is NULL + * - \ref NVML_ERROR_NOT_SUPPORTED if the device does not support Auto Boosted clocks + * - \ref NVML_ERROR_GPU_IS_LOST if the target GPU has fallen off the bus or is otherwise inaccessible + * - \ref NVML_ERROR_UNKNOWN on any unexpected error + * + */ +nvmlReturn_t DECLDIR nvmlDeviceGetAutoBoostedClocksEnabled(nvmlDevice_t device, nvmlEnableState_t *isEnabled, nvmlEnableState_t *defaultIsEnabled); + +/** + * Try to set the current state of Auto Boosted clocks on a device. + * + * For Kepler &tm; or newer fully supported devices. + * + * Auto Boosted clocks are enabled by default on some hardware, allowing the GPU to run at higher clock rates + * to maximize performance as thermal limits allow. Auto Boosted clocks should be disabled if fixed clock + * rates are desired. + * + * Non-root users may use this API by default but can be restricted by root from using this API by calling + * \ref nvmlDeviceSetAPIRestriction with apiType=NVML_RESTRICTED_API_SET_AUTO_BOOSTED_CLOCKS. + * Note: Persistence Mode is required to modify current Auto Boost settings, therefore, it must be enabled. + * + * On Pascal and newer hardware, Auto Boosted clocks are controlled through application clocks. + * Use \ref nvmlDeviceSetApplicationsClocks and \ref nvmlDeviceResetApplicationsClocks to control Auto Boost + * behavior.
+ * + * @param device The identifier of the target device + * @param enabled What state to try to set Auto Boosted clocks of the target device to + * + * @return + * - \ref NVML_SUCCESS If the Auto Boosted clocks were successfully set to the state specified by \a enabled + * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized + * - \ref NVML_ERROR_INVALID_ARGUMENT if \a device is invalid + * - \ref NVML_ERROR_NOT_SUPPORTED if the device does not support Auto Boosted clocks + * - \ref NVML_ERROR_GPU_IS_LOST if the target GPU has fallen off the bus or is otherwise inaccessible + * - \ref NVML_ERROR_UNKNOWN on any unexpected error + * + */ +nvmlReturn_t DECLDIR nvmlDeviceSetAutoBoostedClocksEnabled(nvmlDevice_t device, nvmlEnableState_t enabled); + +/** + * Try to set the default state of Auto Boosted clocks on a device. This is the default state that Auto Boosted clocks will + * return to when no compute running processes (e.g. CUDA application which have an active context) are running + * + * For Kepler &tm; or newer non-GeForce fully supported devices and Maxwell or newer GeForce devices. + * Requires root/admin permissions. + * + * Auto Boosted clocks are enabled by default on some hardware, allowing the GPU to run at higher clock rates + * to maximize performance as thermal limits allow. Auto Boosted clocks should be disabled if fixed clock + * rates are desired. + * + * On Pascal and newer hardware, Auto Boosted clocks are controlled through application clocks. + * Use \ref nvmlDeviceSetApplicationsClocks and \ref nvmlDeviceResetApplicationsClocks to control Auto Boost + * behavior. + * + * @param device The identifier of the target device + * @param enabled What state to try to set default Auto Boosted clocks of the target device to + * @param flags Flags that change the default behavior. Currently Unused. 
+ * + * @return + * - \ref NVML_SUCCESS If the Auto Boosted clock's default state was successfully set to the state specified by \a enabled + * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized + * - \ref NVML_ERROR_NO_PERMISSION If the calling user does not have permission to change Auto Boosted clock's default state. + * - \ref NVML_ERROR_INVALID_ARGUMENT if \a device is invalid + * - \ref NVML_ERROR_NOT_SUPPORTED if the device does not support Auto Boosted clocks + * - \ref NVML_ERROR_GPU_IS_LOST if the target GPU has fallen off the bus or is otherwise inaccessible + * - \ref NVML_ERROR_UNKNOWN on any unexpected error + * + */ +nvmlReturn_t DECLDIR nvmlDeviceSetDefaultAutoBoostedClocksEnabled(nvmlDevice_t device, nvmlEnableState_t enabled, unsigned int flags); + + +/** + * Retrieves the intended operating speed of the device's fan. + * + * Note: The reported speed is the intended fan speed. If the fan is physically blocked and unable to spin, the + * output will not match the actual fan speed. + * + * For all discrete products with dedicated fans. + * + * The fan speed is expressed as a percent of the maximum, i.e. full speed is 100%. + * + * @param device The identifier of the target device + * @param speed Reference in which to return the fan speed percentage + * + * @return + * - \ref NVML_SUCCESS if \a speed has been set + * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized + * - \ref NVML_ERROR_INVALID_ARGUMENT if \a device is invalid or \a speed is NULL + * - \ref NVML_ERROR_NOT_SUPPORTED if the device does not have a fan + * - \ref NVML_ERROR_GPU_IS_LOST if the target GPU has fallen off the bus or is otherwise inaccessible + * - \ref NVML_ERROR_UNKNOWN on any unexpected error + */ +nvmlReturn_t DECLDIR nvmlDeviceGetFanSpeed(nvmlDevice_t device, unsigned int *speed); + +/** + * Retrieves the current temperature readings for the device, in degrees C. + * + * For all products. 
+ * + * See \ref nvmlTemperatureSensors_t for details on available temperature sensors. + * + * @param device The identifier of the target device + * @param sensorType Flag that indicates which sensor reading to retrieve + * @param temp Reference in which to return the temperature reading + * + * @return + * - \ref NVML_SUCCESS if \a temp has been set + * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized + * - \ref NVML_ERROR_INVALID_ARGUMENT if \a device is invalid, \a sensorType is invalid or \a temp is NULL + * - \ref NVML_ERROR_NOT_SUPPORTED if the device does not have the specified sensor + * - \ref NVML_ERROR_GPU_IS_LOST if the target GPU has fallen off the bus or is otherwise inaccessible + * - \ref NVML_ERROR_UNKNOWN on any unexpected error + */ +nvmlReturn_t DECLDIR nvmlDeviceGetTemperature(nvmlDevice_t device, nvmlTemperatureSensors_t sensorType, unsigned int *temp); + +/** + * Retrieves the temperature threshold for the GPU with the specified threshold type in degrees C. + * + * For Kepler &tm; or newer fully supported devices. + * + * See \ref nvmlTemperatureThresholds_t for details on available temperature thresholds. 
+ * + * @param device The identifier of the target device + * @param thresholdType The type of threshold value queried + * @param temp Reference in which to return the temperature reading + * @return + * - \ref NVML_SUCCESS if \a temp has been set + * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized + * - \ref NVML_ERROR_INVALID_ARGUMENT if \a device is invalid, \a thresholdType is invalid or \a temp is NULL + * - \ref NVML_ERROR_NOT_SUPPORTED if the device does not have a temperature sensor or is unsupported + * - \ref NVML_ERROR_GPU_IS_LOST if the target GPU has fallen off the bus or is otherwise inaccessible + * - \ref NVML_ERROR_UNKNOWN on any unexpected error + */ +nvmlReturn_t DECLDIR nvmlDeviceGetTemperatureThreshold(nvmlDevice_t device, nvmlTemperatureThresholds_t thresholdType, unsigned int *temp); + +/** + * Retrieves the current performance state for the device. + * + * For Fermi &tm; or newer fully supported devices. + * + * See \ref nvmlPstates_t for details on allowed performance states. + * + * @param device The identifier of the target device + * @param pState Reference in which to return the performance state reading + * + * @return + * - \ref NVML_SUCCESS if \a pState has been set + * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized + * - \ref NVML_ERROR_INVALID_ARGUMENT if \a device is invalid or \a pState is NULL + * - \ref NVML_ERROR_NOT_SUPPORTED if the device does not support this feature + * - \ref NVML_ERROR_GPU_IS_LOST if the target GPU has fallen off the bus or is otherwise inaccessible + * - \ref NVML_ERROR_UNKNOWN on any unexpected error + */ +nvmlReturn_t DECLDIR nvmlDeviceGetPerformanceState(nvmlDevice_t device, nvmlPstates_t *pState); + +/** + * Retrieves current clocks throttling reasons. + * + * For all fully supported products. + * + * \note More than one bit can be enabled at the same time. Multiple reasons can be affecting clocks at once. 
+ * + * @param device The identifier of the target device + * @param clocksThrottleReasons Reference in which to return bitmask of active clocks throttle + * reasons + * + * @return + * - \ref NVML_SUCCESS if \a clocksThrottleReasons has been set + * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized + * - \ref NVML_ERROR_INVALID_ARGUMENT if \a device is invalid or \a clocksThrottleReasons is NULL + * - \ref NVML_ERROR_NOT_SUPPORTED if the device does not support this feature + * - \ref NVML_ERROR_GPU_IS_LOST if the target GPU has fallen off the bus or is otherwise inaccessible + * - \ref NVML_ERROR_UNKNOWN on any unexpected error + * + * @see nvmlClocksThrottleReasons + * @see nvmlDeviceGetSupportedClocksThrottleReasons + */ +nvmlReturn_t DECLDIR nvmlDeviceGetCurrentClocksThrottleReasons(nvmlDevice_t device, unsigned long long *clocksThrottleReasons); + +/** + * Retrieves bitmask of supported clocks throttle reasons that can be returned by + * \ref nvmlDeviceGetCurrentClocksThrottleReasons + * + * For all fully supported products. + * + * This method is not supported in virtual machines running virtual GPU (vGPU). 
+ * + * @param device The identifier of the target device + * @param supportedClocksThrottleReasons Reference in which to return bitmask of supported + * clocks throttle reasons + * + * @return + * - \ref NVML_SUCCESS if \a supportedClocksThrottleReasons has been set + * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized + * - \ref NVML_ERROR_INVALID_ARGUMENT if \a device is invalid or \a supportedClocksThrottleReasons is NULL + * - \ref NVML_ERROR_GPU_IS_LOST if the target GPU has fallen off the bus or is otherwise inaccessible + * - \ref NVML_ERROR_UNKNOWN on any unexpected error + * + * @see nvmlClocksThrottleReasons + * @see nvmlDeviceGetCurrentClocksThrottleReasons + */ +nvmlReturn_t DECLDIR nvmlDeviceGetSupportedClocksThrottleReasons(nvmlDevice_t device, unsigned long long *supportedClocksThrottleReasons); + +/** + * Deprecated: Use \ref nvmlDeviceGetPerformanceState. This function exposes an incorrect generalization. + * + * Retrieve the current performance state for the device. + * + * For Fermi &tm; or newer fully supported devices. + * + * See \ref nvmlPstates_t for details on allowed performance states. + * + * @param device The identifier of the target device + * @param pState Reference in which to return the performance state reading + * + * @return + * - \ref NVML_SUCCESS if \a pState has been set + * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized + * - \ref NVML_ERROR_INVALID_ARGUMENT if \a device is invalid or \a pState is NULL + * - \ref NVML_ERROR_NOT_SUPPORTED if the device does not support this feature + * - \ref NVML_ERROR_GPU_IS_LOST if the target GPU has fallen off the bus or is otherwise inaccessible + * - \ref NVML_ERROR_UNKNOWN on any unexpected error + */ +nvmlReturn_t DECLDIR nvmlDeviceGetPowerState(nvmlDevice_t device, nvmlPstates_t *pState); + +/** + * This API has been deprecated. + * + * Retrieves the power management mode associated with this device. 
+ * + * For products from the Fermi family. + * - Requires \a NVML_INFOROM_POWER version 3.0 or higher. + * + * For products from the Kepler or newer families. + * - Does not require \a NVML_INFOROM_POWER object. + * + * This flag indicates whether any power management algorithm is currently active on the device. An + * enabled state does not necessarily mean the device is being actively throttled -- only that + * the driver will do so if the appropriate conditions are met. + * + * See \ref nvmlEnableState_t for details on allowed modes. + * + * @param device The identifier of the target device + * @param mode Reference in which to return the current power management mode + * + * @return + * - \ref NVML_SUCCESS if \a mode has been set + * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized + * - \ref NVML_ERROR_INVALID_ARGUMENT if \a device is invalid or \a mode is NULL + * - \ref NVML_ERROR_NOT_SUPPORTED if the device does not support this feature + * - \ref NVML_ERROR_GPU_IS_LOST if the target GPU has fallen off the bus or is otherwise inaccessible + * - \ref NVML_ERROR_UNKNOWN on any unexpected error + */ +nvmlReturn_t DECLDIR nvmlDeviceGetPowerManagementMode(nvmlDevice_t device, nvmlEnableState_t *mode); + +/** + * Retrieves the power management limit associated with this device. + * + * For Fermi &tm; or newer fully supported devices. + * + * The power limit defines the upper boundary for the card's power draw. If + * the card's total power draw reaches this limit the power management algorithm kicks in. + * + * This reading is only available if power management mode is supported. + * See \ref nvmlDeviceGetPowerManagementMode.
+ * + * @param device The identifier of the target device + * @param limit Reference in which to return the power management limit in milliwatts + * + * @return + * - \ref NVML_SUCCESS if \a limit has been set + * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized + * - \ref NVML_ERROR_INVALID_ARGUMENT if \a device is invalid or \a limit is NULL + * - \ref NVML_ERROR_NOT_SUPPORTED if the device does not support this feature + * - \ref NVML_ERROR_GPU_IS_LOST if the target GPU has fallen off the bus or is otherwise inaccessible + * - \ref NVML_ERROR_UNKNOWN on any unexpected error + */ +nvmlReturn_t DECLDIR nvmlDeviceGetPowerManagementLimit(nvmlDevice_t device, unsigned int *limit); + +/** + * Retrieves information about possible values of power management limits on this device. + * + * For Kepler &tm; or newer fully supported devices. + * + * @param device The identifier of the target device + * @param minLimit Reference in which to return the minimum power management limit in milliwatts + * @param maxLimit Reference in which to return the maximum power management limit in milliwatts + * + * @return + * - \ref NVML_SUCCESS if \a minLimit and \a maxLimit have been set + * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized + * - \ref NVML_ERROR_INVALID_ARGUMENT if \a device is invalid or \a minLimit or \a maxLimit is NULL + * - \ref NVML_ERROR_NOT_SUPPORTED if the device does not support this feature + * - \ref NVML_ERROR_GPU_IS_LOST if the target GPU has fallen off the bus or is otherwise inaccessible + * - \ref NVML_ERROR_UNKNOWN on any unexpected error + * + * @see nvmlDeviceSetPowerManagementLimit + */ +nvmlReturn_t DECLDIR nvmlDeviceGetPowerManagementLimitConstraints(nvmlDevice_t device, unsigned int *minLimit, unsigned int *maxLimit); + +/** + * Retrieves default power management limit on this device, in milliwatts. 
+ * Default power management limit is a power management limit that the device boots with. + * + * For Kepler &tm; or newer fully supported devices. + * + * @param device The identifier of the target device + * @param defaultLimit Reference in which to return the default power management limit in milliwatts + * + * @return + * - \ref NVML_SUCCESS if \a defaultLimit has been set + * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized + * - \ref NVML_ERROR_INVALID_ARGUMENT if \a device is invalid or \a defaultLimit is NULL + * - \ref NVML_ERROR_NOT_SUPPORTED if the device does not support this feature + * - \ref NVML_ERROR_GPU_IS_LOST if the target GPU has fallen off the bus or is otherwise inaccessible + * - \ref NVML_ERROR_UNKNOWN on any unexpected error + */ +nvmlReturn_t DECLDIR nvmlDeviceGetPowerManagementDefaultLimit(nvmlDevice_t device, unsigned int *defaultLimit); + +/** + * Retrieves power usage for this GPU in milliwatts and its associated circuitry (e.g. memory) + * + * For Fermi &tm; or newer fully supported devices. + * + * On Fermi and Kepler GPUs the reading is accurate to within +/- 5% of current power draw. + * + * It is only available if power management mode is supported. See \ref nvmlDeviceGetPowerManagementMode. 
+ * + * @param device The identifier of the target device + * @param power Reference in which to return the power usage information + * + * @return + * - \ref NVML_SUCCESS if \a power has been populated + * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized + * - \ref NVML_ERROR_INVALID_ARGUMENT if \a device is invalid or \a power is NULL + * - \ref NVML_ERROR_NOT_SUPPORTED if the device does not support power readings + * - \ref NVML_ERROR_GPU_IS_LOST if the target GPU has fallen off the bus or is otherwise inaccessible + * - \ref NVML_ERROR_UNKNOWN on any unexpected error + */ +nvmlReturn_t DECLDIR nvmlDeviceGetPowerUsage(nvmlDevice_t device, unsigned int *power); + +/** + * Retrieves total energy consumption for this GPU in millijoules (mJ) since the driver was last reloaded + * + * For newer than Pascal &tm; fully supported devices. + * + * @param device The identifier of the target device + * @param energy Reference in which to return the energy consumption information + * + * @return + * - \ref NVML_SUCCESS if \a energy has been populated + * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized + * - \ref NVML_ERROR_INVALID_ARGUMENT if \a device is invalid or \a energy is NULL + * - \ref NVML_ERROR_NOT_SUPPORTED if the device does not support energy readings + * - \ref NVML_ERROR_GPU_IS_LOST if the target GPU has fallen off the bus or is otherwise inaccessible + * - \ref NVML_ERROR_UNKNOWN on any unexpected error + */ +nvmlReturn_t DECLDIR nvmlDeviceGetTotalEnergyConsumption(nvmlDevice_t device, unsigned long long *energy); + +/** + * Get the effective power limit that the driver enforces after taking into account all limiters + * + * Note: This can be different from the \ref nvmlDeviceGetPowerManagementLimit if other limits are set elsewhere + * This includes the out of band power limit interface + * + * For Kepler &tm; or newer fully supported devices. 
+ * + * @param device The device to communicate with + * @param limit Reference in which to return the power management limit in milliwatts + * + * @return + * - \ref NVML_SUCCESS if \a limit has been set + * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized + * - \ref NVML_ERROR_INVALID_ARGUMENT if \a device is invalid or \a limit is NULL + * - \ref NVML_ERROR_NOT_SUPPORTED if the device does not support this feature + * - \ref NVML_ERROR_GPU_IS_LOST if the target GPU has fallen off the bus or is otherwise inaccessible + * - \ref NVML_ERROR_UNKNOWN on any unexpected error + */ +nvmlReturn_t DECLDIR nvmlDeviceGetEnforcedPowerLimit(nvmlDevice_t device, unsigned int *limit); + +/** + * Retrieves the current GOM and pending GOM (the one that GPU will switch to after reboot). + * + * For GK110 M-class and X-class Tesla &tm; products from the Kepler family. + * Modes \ref NVML_GOM_LOW_DP and \ref NVML_GOM_ALL_ON are supported on fully supported GeForce products. + * Not supported on Quadro ® and Tesla &tm; C-class products. 
+ * + * @param device The identifier of the target device + * @param current Reference in which to return the current GOM + * @param pending Reference in which to return the pending GOM + * + * @return + * - \ref NVML_SUCCESS if \a current and \a pending have been populated + * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized + * - \ref NVML_ERROR_INVALID_ARGUMENT if \a device is invalid or \a current or \a pending is NULL + * - \ref NVML_ERROR_NOT_SUPPORTED if the device does not support this feature + * - \ref NVML_ERROR_GPU_IS_LOST if the target GPU has fallen off the bus or is otherwise inaccessible + * - \ref NVML_ERROR_UNKNOWN on any unexpected error + * + * @see nvmlGpuOperationMode_t + * @see nvmlDeviceSetGpuOperationMode + */ +nvmlReturn_t DECLDIR nvmlDeviceGetGpuOperationMode(nvmlDevice_t device, nvmlGpuOperationMode_t *current, nvmlGpuOperationMode_t *pending); + +/** + * Retrieves the amount of used, free and total memory available on the device, in bytes. + * + * For all products. + * + * Enabling ECC reduces the amount of total available memory, due to the extra required parity bits. + * Under WDDM most device memory is allocated and managed on startup by Windows. + * + * Under Linux and Windows TCC, the reported amount of used memory is equal to the sum of memory allocated + * by all active channels on the device. + * + * See \ref nvmlMemory_t for details on available memory info.
+ * + * @param device The identifier of the target device + * @param memory Reference in which to return the memory information + * + * @return + * - \ref NVML_SUCCESS if \a memory has been populated + * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized + * - \ref NVML_ERROR_INVALID_ARGUMENT if \a device is invalid or \a memory is NULL + * - \ref NVML_ERROR_GPU_IS_LOST if the target GPU has fallen off the bus or is otherwise inaccessible + * - \ref NVML_ERROR_UNKNOWN on any unexpected error + */ +nvmlReturn_t DECLDIR nvmlDeviceGetMemoryInfo(nvmlDevice_t device, nvmlMemory_t *memory); + +/** + * Retrieves the current compute mode for the device. + * + * For all products. + * + * See \ref nvmlComputeMode_t for details on allowed compute modes. + * + * @param device The identifier of the target device + * @param mode Reference in which to return the current compute mode + * + * @return + * - \ref NVML_SUCCESS if \a mode has been set + * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized + * - \ref NVML_ERROR_INVALID_ARGUMENT if \a device is invalid or \a mode is NULL + * - \ref NVML_ERROR_NOT_SUPPORTED if the device does not support this feature + * - \ref NVML_ERROR_GPU_IS_LOST if the target GPU has fallen off the bus or is otherwise inaccessible + * - \ref NVML_ERROR_UNKNOWN on any unexpected error + * + * @see nvmlDeviceSetComputeMode() + */ +nvmlReturn_t DECLDIR nvmlDeviceGetComputeMode(nvmlDevice_t device, nvmlComputeMode_t *mode); + +/** + * Retrieves the CUDA compute capability of the device. + * + * For all products. + * + * Returns the major and minor compute capability version numbers of the + * device. The major and minor versions are equivalent to the + * CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MINOR and + * CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MAJOR attributes that would be + * returned by CUDA's cuDeviceGetAttribute(). 
+ * + * @param device The identifier of the target device + * @param major Reference in which to return the major CUDA compute capability + * @param minor Reference in which to return the minor CUDA compute capability + * + * @return + * - \ref NVML_SUCCESS if \a major and \a minor have been set + * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized + * - \ref NVML_ERROR_INVALID_ARGUMENT if \a device is invalid or \a major or \a minor are NULL + * - \ref NVML_ERROR_GPU_IS_LOST if the target GPU has fallen off the bus or is otherwise inaccessible + * - \ref NVML_ERROR_UNKNOWN on any unexpected error + */ +nvmlReturn_t DECLDIR nvmlDeviceGetCudaComputeCapability(nvmlDevice_t device, int *major, int *minor); + +/** + * Retrieves the current and pending ECC modes for the device. + * + * For Fermi &tm; or newer fully supported devices. + * Only applicable to devices with ECC. + * Requires \a NVML_INFOROM_ECC version 1.0 or higher. + * + * Changing ECC modes requires a reboot. The "pending" ECC mode refers to the target mode following + * the next reboot. + * + * See \ref nvmlEnableState_t for details on allowed modes. 
+ * + * @param device The identifier of the target device + * @param current Reference in which to return the current ECC mode + * @param pending Reference in which to return the pending ECC mode + * + * @return + * - \ref NVML_SUCCESS if \a current and \a pending have been set + * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized + * - \ref NVML_ERROR_INVALID_ARGUMENT if \a device is invalid or either \a current or \a pending is NULL + * - \ref NVML_ERROR_NOT_SUPPORTED if the device does not support this feature + * - \ref NVML_ERROR_GPU_IS_LOST if the target GPU has fallen off the bus or is otherwise inaccessible + * - \ref NVML_ERROR_UNKNOWN on any unexpected error + * + * @see nvmlDeviceSetEccMode() + */ +nvmlReturn_t DECLDIR nvmlDeviceGetEccMode(nvmlDevice_t device, nvmlEnableState_t *current, nvmlEnableState_t *pending); + +/** + * Retrieves the device boardId from 0-N. + * Devices with the same boardId indicate GPUs connected to the same PLX. Use in conjunction with + * \ref nvmlDeviceGetMultiGpuBoard() to decide if they are on the same board as well. + * The boardId returned is a unique ID for the current configuration. Uniqueness and ordering across + * reboots and system configurations is not guaranteed (i.e. if a Tesla K40c returns 0x100 and + * the two GPUs on a Tesla K10 in the same system returns 0x200 it is not guaranteed they will + * always return those values but they will always be different from each other). + * + * + * For Fermi &tm; or newer fully supported devices. 
+ * + * @param device The identifier of the target device + * @param boardId Reference in which to return the device's board ID + * + * @return + * - \ref NVML_SUCCESS if \a boardId has been set + * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized + * - \ref NVML_ERROR_INVALID_ARGUMENT if \a device is invalid or \a boardId is NULL + * - \ref NVML_ERROR_NOT_SUPPORTED if the device does not support this feature + * - \ref NVML_ERROR_GPU_IS_LOST if the target GPU has fallen off the bus or is otherwise inaccessible + * - \ref NVML_ERROR_UNKNOWN on any unexpected error + */ +nvmlReturn_t DECLDIR nvmlDeviceGetBoardId(nvmlDevice_t device, unsigned int *boardId); + +/** + * Retrieves whether the device is on a Multi-GPU Board + * Devices that are on multi-GPU boards will set \a multiGpuBool to a non-zero value. + * + * For Fermi &tm; or newer fully supported devices. + * + * @param device The identifier of the target device + * @param multiGpuBool Reference in which to return a zero or non-zero value + * to indicate whether the device is on a multi GPU board + * + * @return + * - \ref NVML_SUCCESS if \a multiGpuBool has been set + * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized + * - \ref NVML_ERROR_INVALID_ARGUMENT if \a device is invalid or \a multiGpuBool is NULL + * - \ref NVML_ERROR_NOT_SUPPORTED if the device does not support this feature + * - \ref NVML_ERROR_GPU_IS_LOST if the target GPU has fallen off the bus or is otherwise inaccessible + * - \ref NVML_ERROR_UNKNOWN on any unexpected error + */ +nvmlReturn_t DECLDIR nvmlDeviceGetMultiGpuBoard(nvmlDevice_t device, unsigned int *multiGpuBool); + +/** + * Retrieves the total ECC error counts for the device. + * + * For Fermi &tm; or newer fully supported devices. + * Only applicable to devices with ECC. + * Requires \a NVML_INFOROM_ECC version 1.0 or higher. + * Requires ECC Mode to be enabled. 
+ * + * The total error count is the sum of errors across each of the separate memory systems, i.e. the total set of + * errors across the entire device. + * + * See \ref nvmlMemoryErrorType_t for a description of available error types.\n + * See \ref nvmlEccCounterType_t for a description of available counter types. + * + * @param device The identifier of the target device + * @param errorType Flag that specifies the type of the errors. + * @param counterType Flag that specifies the counter-type of the errors. + * @param eccCounts Reference in which to return the specified ECC errors + * + * @return + * - \ref NVML_SUCCESS if \a eccCounts has been set + * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized + * - \ref NVML_ERROR_INVALID_ARGUMENT if \a device, \a errorType or \a counterType is invalid, or \a eccCounts is NULL + * - \ref NVML_ERROR_NOT_SUPPORTED if the device does not support this feature + * - \ref NVML_ERROR_GPU_IS_LOST if the target GPU has fallen off the bus or is otherwise inaccessible + * - \ref NVML_ERROR_UNKNOWN on any unexpected error + * + * @see nvmlDeviceClearEccErrorCounts() + */ +nvmlReturn_t DECLDIR nvmlDeviceGetTotalEccErrors(nvmlDevice_t device, nvmlMemoryErrorType_t errorType, nvmlEccCounterType_t counterType, unsigned long long *eccCounts); + +/** + * Retrieves the detailed ECC error counts for the device. + * + * @deprecated This API supports only a fixed set of ECC error locations + * On different GPU architectures different locations are supported + * See \ref nvmlDeviceGetMemoryErrorCounter + * + * For Fermi &tm; or newer fully supported devices. + * Only applicable to devices with ECC. + * Requires \a NVML_INFOROM_ECC version 2.0 or higher to report aggregate location-based ECC counts. + * Requires \a NVML_INFOROM_ECC version 1.0 or higher to report all other ECC counts. + * Requires ECC Mode to be enabled. 
+ * + * Detailed errors provide separate ECC counts for specific parts of the memory system. + * + * Reports zero for unsupported ECC error counters when a subset of ECC error counters are supported. + * + * See \ref nvmlMemoryErrorType_t for a description of available bit types.\n + * See \ref nvmlEccCounterType_t for a description of available counter types.\n + * See \ref nvmlEccErrorCounts_t for a description of provided detailed ECC counts. + * + * @param device The identifier of the target device + * @param errorType Flag that specifies the type of the errors. + * @param counterType Flag that specifies the counter-type of the errors. + * @param eccCounts Reference in which to return the specified ECC errors + * + * @return + * - \ref NVML_SUCCESS if \a eccCounts has been populated + * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized + * - \ref NVML_ERROR_INVALID_ARGUMENT if \a device, \a errorType or \a counterType is invalid, or \a eccCounts is NULL + * - \ref NVML_ERROR_NOT_SUPPORTED if the device does not support this feature + * - \ref NVML_ERROR_GPU_IS_LOST if the target GPU has fallen off the bus or is otherwise inaccessible + * - \ref NVML_ERROR_UNKNOWN on any unexpected error + * + * @see nvmlDeviceClearEccErrorCounts() + */ +nvmlReturn_t DECLDIR nvmlDeviceGetDetailedEccErrors(nvmlDevice_t device, nvmlMemoryErrorType_t errorType, nvmlEccCounterType_t counterType, nvmlEccErrorCounts_t *eccCounts); + +/** + * Retrieves the requested memory error counter for the device. + * + * For Fermi &tm; or newer fully supported devices. + * Requires \a NVML_INFOROM_ECC version 2.0 or higher to report aggregate location-based memory error counts. + * Requires \a NVML_INFOROM_ECC version 1.0 or higher to report all other memory error counts. + * + * Only applicable to devices with ECC. + * + * Requires ECC Mode to be enabled. 
+ * + * See \ref nvmlMemoryErrorType_t for a description of available memory error types.\n + * See \ref nvmlEccCounterType_t for a description of available counter types.\n + * See \ref nvmlMemoryLocation_t for a description of available counter locations.\n + * + * @param device The identifier of the target device + * @param errorType Flag that specifies the type of error. + * @param counterType Flag that specifies the counter-type of the errors. + * @param locationType Specifies the location of the counter. + * @param count Reference in which to return the ECC counter + * + * @return + * - \ref NVML_SUCCESS if \a count has been populated + * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized + * - \ref NVML_ERROR_INVALID_ARGUMENT if \a device, \a errorType, \a counterType or \a locationType is + * invalid, or \a count is NULL + * - \ref NVML_ERROR_NOT_SUPPORTED if the device does not support ECC error reporting in the specified memory + * - \ref NVML_ERROR_GPU_IS_LOST if the target GPU has fallen off the bus or is otherwise inaccessible + * - \ref NVML_ERROR_UNKNOWN on any unexpected error + */ +nvmlReturn_t DECLDIR nvmlDeviceGetMemoryErrorCounter(nvmlDevice_t device, nvmlMemoryErrorType_t errorType, + nvmlEccCounterType_t counterType, + nvmlMemoryLocation_t locationType, unsigned long long *count); + +/** + * Retrieves the current utilization rates for the device's major subsystems. + * + * For Fermi &tm; or newer fully supported devices. + * + * See \ref nvmlUtilization_t for details on available utilization rates. + * + * \note During driver initialization when ECC is enabled one can see high GPU and Memory Utilization readings. + * This is caused by ECC Memory Scrubbing mechanism that is performed during driver initialization.
+ * + * @param device The identifier of the target device + * @param utilization Reference in which to return the utilization information + * + * @return + * - \ref NVML_SUCCESS if \a utilization has been populated + * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized + * - \ref NVML_ERROR_INVALID_ARGUMENT if \a device is invalid or \a utilization is NULL + * - \ref NVML_ERROR_NOT_SUPPORTED if the device does not support this feature + * - \ref NVML_ERROR_GPU_IS_LOST if the target GPU has fallen off the bus or is otherwise inaccessible + * - \ref NVML_ERROR_UNKNOWN on any unexpected error + */ +nvmlReturn_t DECLDIR nvmlDeviceGetUtilizationRates(nvmlDevice_t device, nvmlUtilization_t *utilization); + +/** + * Retrieves the current utilization and sampling size in microseconds for the Encoder + * + * For Kepler &tm; or newer fully supported devices. + * + * @param device The identifier of the target device + * @param utilization Reference to an unsigned int for encoder utilization info + * @param samplingPeriodUs Reference to an unsigned int for the sampling period in US + * + * @return + * - \ref NVML_SUCCESS if \a utilization has been populated + * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized + * - \ref NVML_ERROR_INVALID_ARGUMENT if \a device is invalid, \a utilization is NULL, or \a samplingPeriodUs is NULL + * - \ref NVML_ERROR_NOT_SUPPORTED if the device does not support this feature + * - \ref NVML_ERROR_GPU_IS_LOST if the target GPU has fallen off the bus or is otherwise inaccessible + * - \ref NVML_ERROR_UNKNOWN on any unexpected error + */ +nvmlReturn_t DECLDIR nvmlDeviceGetEncoderUtilization(nvmlDevice_t device, unsigned int *utilization, unsigned int *samplingPeriodUs); + +/** + * Retrieves the current capacity of the device's encoder, as a percentage of maximum encoder capacity with valid values in the range 0-100. + * + * For Maxwell &tm; or newer fully supported devices. 
+ * + * @param device The identifier of the target device + * @param encoderQueryType Type of encoder to query + * @param encoderCapacity Reference to an unsigned int for the encoder capacity + * + * @return + * - \ref NVML_SUCCESS if \a encoderCapacity is fetched + * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized + * - \ref NVML_ERROR_INVALID_ARGUMENT if \a encoderCapacity is NULL, or \a device or \a encoderQueryType + * are invalid + * - \ref NVML_ERROR_NOT_SUPPORTED if device does not support the encoder specified in \a encoderQueryType + * - \ref NVML_ERROR_GPU_IS_LOST if the target GPU has fallen off the bus or is otherwise inaccessible + * - \ref NVML_ERROR_UNKNOWN on any unexpected error + */ +nvmlReturn_t DECLDIR nvmlDeviceGetEncoderCapacity (nvmlDevice_t device, nvmlEncoderType_t encoderQueryType, unsigned int *encoderCapacity); + +/** + * Retrieves the current encoder statistics for a given device. + * + * For Maxwell &tm; or newer fully supported devices.
+ * + * @param device The identifier of the target device + * @param sessionCount Reference to an unsigned int for count of active encoder sessions + * @param averageFps Reference to an unsigned int for trailing average FPS of all active sessions + * @param averageLatency Reference to an unsigned int for encode latency in microseconds + * + * @return + * - \ref NVML_SUCCESS if \a sessionCount, \a averageFps and \a averageLatency are fetched + * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized + * - \ref NVML_ERROR_INVALID_ARGUMENT if \a sessionCount, or \a device or \a averageFps, + * or \a averageLatency is NULL + * - \ref NVML_ERROR_GPU_IS_LOST if the target GPU has fallen off the bus or is otherwise inaccessible + * - \ref NVML_ERROR_UNKNOWN on any unexpected error + */ +nvmlReturn_t DECLDIR nvmlDeviceGetEncoderStats (nvmlDevice_t device, unsigned int *sessionCount, + unsigned int *averageFps, unsigned int *averageLatency); + +/** + * Retrieves information about active encoder sessions on a target device. + * + * An array of active encoder sessions is returned in the caller-supplied buffer pointed at by \a sessionInfos. The + * array element count is passed in \a sessionCount, and \a sessionCount is used to return the number of sessions + * written to the buffer. + * + * If the supplied buffer is not large enough to accommodate the active session array, the function returns + * NVML_ERROR_INSUFFICIENT_SIZE, with the element count of nvmlEncoderSessionInfo_t array required in \a sessionCount. + * To query the number of active encoder sessions, call this function with *sessionCount = 0. The code will return + * NVML_SUCCESS with number of active encoder sessions updated in *sessionCount. + * + * For Maxwell &tm; or newer fully supported devices. + * + * @param device The identifier of the target device + * @param sessionCount Reference to caller supplied array size, and returns the number of sessions.
+ * @param sessionInfos Reference in which to return the session information + * + * @return + * - \ref NVML_SUCCESS if \a sessionInfos is fetched + * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized + * - \ref NVML_ERROR_INSUFFICIENT_SIZE if \a sessionCount is too small, array element count is returned in \a sessionCount + * - \ref NVML_ERROR_INVALID_ARGUMENT if \a sessionCount is NULL. + * - \ref NVML_ERROR_GPU_IS_LOST if the target GPU has fallen off the bus or is otherwise inaccessible + * - \ref NVML_ERROR_UNKNOWN on any unexpected error + */ +nvmlReturn_t DECLDIR nvmlDeviceGetEncoderSessions(nvmlDevice_t device, unsigned int *sessionCount, nvmlEncoderSessionInfo_t *sessionInfos); + +/** + * Retrieves the current utilization and sampling size in microseconds for the Decoder + * + * For Kepler &tm; or newer fully supported devices. + * + * @param device The identifier of the target device + * @param utilization Reference to an unsigned int for decoder utilization info + * @param samplingPeriodUs Reference to an unsigned int for the sampling period in US + * + * @return + * - \ref NVML_SUCCESS if \a utilization has been populated + * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized + * - \ref NVML_ERROR_INVALID_ARGUMENT if \a device is invalid, \a utilization is NULL, or \a samplingPeriodUs is NULL + * - \ref NVML_ERROR_NOT_SUPPORTED if the device does not support this feature + * - \ref NVML_ERROR_GPU_IS_LOST if the target GPU has fallen off the bus or is otherwise inaccessible + * - \ref NVML_ERROR_UNKNOWN on any unexpected error + */ +nvmlReturn_t DECLDIR nvmlDeviceGetDecoderUtilization(nvmlDevice_t device, unsigned int *utilization, unsigned int *samplingPeriodUs); + +/** + * Retrieves the current and pending driver model for the device. + * + * For Fermi &tm; or newer fully supported devices. + * For windows only. 
+ * + * On Windows platforms the device driver can run in either WDDM or WDM (TCC) mode. If a display is attached + * to the device it must run in WDDM mode. TCC mode is preferred if a display is not attached. + * + * See \ref nvmlDriverModel_t for details on available driver models. + * + * @param device The identifier of the target device + * @param current Reference in which to return the current driver model + * @param pending Reference in which to return the pending driver model + * + * @return + * - \ref NVML_SUCCESS if either \a current and/or \a pending have been set + * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized + * - \ref NVML_ERROR_INVALID_ARGUMENT if \a device is invalid or both \a current and \a pending are NULL + * - \ref NVML_ERROR_NOT_SUPPORTED if the platform is not windows + * - \ref NVML_ERROR_GPU_IS_LOST if the target GPU has fallen off the bus or is otherwise inaccessible + * - \ref NVML_ERROR_UNKNOWN on any unexpected error + * + * @see nvmlDeviceSetDriverModel() + */ +nvmlReturn_t DECLDIR nvmlDeviceGetDriverModel(nvmlDevice_t device, nvmlDriverModel_t *current, nvmlDriverModel_t *pending); + +/** + * Get VBIOS version of the device. + * + * For all products. + * + * The VBIOS version may change from time to time. It will not exceed 32 characters in length + * (including the NULL terminator). See \ref nvmlConstants::NVML_DEVICE_VBIOS_VERSION_BUFFER_SIZE. 
+ * + * @param device The identifier of the target device + * @param version Reference to which to return the VBIOS version + * @param length The maximum allowed length of the string returned in \a version + * + * @return + * - \ref NVML_SUCCESS if \a version has been set + * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized + * - \ref NVML_ERROR_INVALID_ARGUMENT if \a device is invalid, or \a version is NULL + * - \ref NVML_ERROR_INSUFFICIENT_SIZE if \a length is too small + * - \ref NVML_ERROR_GPU_IS_LOST if the target GPU has fallen off the bus or is otherwise inaccessible + * - \ref NVML_ERROR_UNKNOWN on any unexpected error + */ +nvmlReturn_t DECLDIR nvmlDeviceGetVbiosVersion(nvmlDevice_t device, char *version, unsigned int length); + +/** + * Get Bridge Chip Information for all the bridge chips on the board. + * + * For all fully supported products. + * Only applicable to multi-GPU products. + * + * @param device The identifier of the target device + * @param bridgeHierarchy Reference to the returned bridge chip Hierarchy + * + * @return + * - \ref NVML_SUCCESS if bridge chip exists + * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized + * - \ref NVML_ERROR_INVALID_ARGUMENT if \a device is invalid, or \a bridgeHierarchy is NULL + * - \ref NVML_ERROR_NOT_SUPPORTED if bridge chip not supported on the device + * - \ref NVML_ERROR_GPU_IS_LOST if the target GPU has fallen off the bus or is otherwise inaccessible + * - \ref NVML_ERROR_UNKNOWN on any unexpected error + * + */ +nvmlReturn_t DECLDIR nvmlDeviceGetBridgeChipInfo(nvmlDevice_t device, nvmlBridgeChipHierarchy_t *bridgeHierarchy); + +/** + * Get information about processes with a compute context on a device + * + * For Fermi &tm; or newer fully supported devices. + * + * This function returns information only about compute running processes (e.g. CUDA applications which have + * active context). Any graphics applications (e.g.
using OpenGL, DirectX) won't be listed by this function. + * + * To query the current number of running compute processes, call this function with *infoCount = 0. The + * return code will be NVML_ERROR_INSUFFICIENT_SIZE, or NVML_SUCCESS if none are running. For this call + * \a infos is allowed to be NULL. + * + * The usedGpuMemory field returned is all of the memory used by the application. + * + * Keep in mind that information returned by this call is dynamic and the number of elements might change in + * time. Allocate more space for \a infos table in case new compute processes are spawned. + * + * @param device The identifier of the target device + * @param infoCount Reference in which to provide the \a infos array size, and + * to return the number of returned elements + * @param infos Reference in which to return the process information + * + * @return + * - \ref NVML_SUCCESS if \a infoCount and \a infos have been populated + * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized + * - \ref NVML_ERROR_INSUFFICIENT_SIZE if \a infoCount indicates that the \a infos array is too small + * \a infoCount will contain minimal amount of space necessary for + * the call to complete + * - \ref NVML_ERROR_INVALID_ARGUMENT if \a device is invalid, either of \a infoCount or \a infos is NULL + * - \ref NVML_ERROR_GPU_IS_LOST if the target GPU has fallen off the bus or is otherwise inaccessible + * - \ref NVML_ERROR_UNKNOWN on any unexpected error + * + * @see \ref nvmlSystemGetProcessName + */ +nvmlReturn_t DECLDIR nvmlDeviceGetComputeRunningProcesses(nvmlDevice_t device, unsigned int *infoCount, nvmlProcessInfo_t *infos); + +/** + * Get information about processes with a graphics context on a device + * + * For Kepler &tm; or newer fully supported devices. + * + * This function returns information only about graphics based processes + * (eg. 
applications using OpenGL, DirectX) + * + * To query the current number of running graphics processes, call this function with *infoCount = 0. The + * return code will be NVML_ERROR_INSUFFICIENT_SIZE, or NVML_SUCCESS if none are running. For this call + * \a infos is allowed to be NULL. + * + * The usedGpuMemory field returned is all of the memory used by the application. + * + * Keep in mind that information returned by this call is dynamic and the number of elements might change in + * time. Allocate more space for \a infos table in case new graphics processes are spawned. + * + * @param device The identifier of the target device + * @param infoCount Reference in which to provide the \a infos array size, and + * to return the number of returned elements + * @param infos Reference in which to return the process information + * + * @return + * - \ref NVML_SUCCESS if \a infoCount and \a infos have been populated + * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized + * - \ref NVML_ERROR_INSUFFICIENT_SIZE if \a infoCount indicates that the \a infos array is too small + * \a infoCount will contain minimal amount of space necessary for + * the call to complete + * - \ref NVML_ERROR_INVALID_ARGUMENT if \a device is invalid, either of \a infoCount or \a infos is NULL + * - \ref NVML_ERROR_GPU_IS_LOST if the target GPU has fallen off the bus or is otherwise inaccessible + * - \ref NVML_ERROR_UNKNOWN on any unexpected error + * + * @see \ref nvmlSystemGetProcessName + */ +nvmlReturn_t DECLDIR nvmlDeviceGetGraphicsRunningProcesses(nvmlDevice_t device, unsigned int *infoCount, nvmlProcessInfo_t *infos); + +/** + * Check if the GPU devices are on the same physical board. + * + * For all fully supported products. + * + * @param device1 The first GPU device + * @param device2 The second GPU device + * @param onSameBoard Reference in which to return the status. + * Non-zero indicates that the GPUs are on the same board. 
+ * + * @return + * - \ref NVML_SUCCESS if \a onSameBoard has been set + * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized + * - \ref NVML_ERROR_INVALID_ARGUMENT if \a device1 or \a device2 are invalid or \a onSameBoard is NULL + * - \ref NVML_ERROR_NOT_SUPPORTED if this check is not supported by the device + * - \ref NVML_ERROR_GPU_IS_LOST if either GPU has fallen off the bus or is otherwise inaccessible + * - \ref NVML_ERROR_UNKNOWN on any unexpected error + */ +nvmlReturn_t DECLDIR nvmlDeviceOnSameBoard(nvmlDevice_t device1, nvmlDevice_t device2, int *onSameBoard); + +/** + * Retrieves the root/admin permissions on the target API. See \a nvmlRestrictedAPI_t for the list of supported APIs. + * If an API is restricted only root users can call that API. See \a nvmlDeviceSetAPIRestriction to change current permissions. + * + * For all fully supported products. + * + * @param device The identifier of the target device + * @param apiType Target API type for this operation + * @param isRestricted Reference in which to return the current restriction + * NVML_FEATURE_ENABLED indicates that the API is root-only + * NVML_FEATURE_DISABLED indicates that the API is accessible to all users + * + * @return + * - \ref NVML_SUCCESS if \a isRestricted has been set + * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized + * - \ref NVML_ERROR_INVALID_ARGUMENT if \a device is invalid, \a apiType incorrect or \a isRestricted is NULL + * - \ref NVML_ERROR_NOT_SUPPORTED if this query is not supported by the device or the device does not support + * the feature that is being queried (E.G.
Enabling/disabling Auto Boosted clocks is + * not supported by the device) + * - \ref NVML_ERROR_GPU_IS_LOST if the target GPU has fallen off the bus or is otherwise inaccessible + * - \ref NVML_ERROR_UNKNOWN on any unexpected error + * + * @see nvmlRestrictedAPI_t + */ +nvmlReturn_t DECLDIR nvmlDeviceGetAPIRestriction(nvmlDevice_t device, nvmlRestrictedAPI_t apiType, nvmlEnableState_t *isRestricted); + +/** + * Gets recent samples for the GPU. + * + * For Kepler &tm; or newer fully supported devices. + * + * Based on type, this method can be used to fetch the power, utilization or clock samples maintained in the buffer by + * the driver. + * + * Power, Utilization and Clock samples are returned as type "unsigned int" for the union nvmlValue_t. + * + * To get the size of samples that user needs to allocate, the method is invoked with samples set to NULL. + * The returned sampleCount will provide the number of samples that can be queried. The user needs to + * allocate the buffer with size as sampleCount * sizeof(nvmlSample_t). + * + * lastSeenTimeStamp represents CPU timestamp in microseconds. Set it to 0 to fetch all the samples maintained by the + * underlying buffer. Set lastSeenTimeStamp to one of the timeStamps retrieved from the data of the previous query + * to get more recent samples. + * + * This method fetches the number of entries which can be accommodated in the provided samples array, and the + * reference sampleCount is updated to indicate how many samples were actually retrieved. The advantage of using this + * method for samples in contrast to polling via existing methods is to get higher frequency data at lower polling cost. + * + * @param device The identifier for the target device + * @param type Type of sampling event + * @param lastSeenTimeStamp Return only samples with timestamp greater than lastSeenTimeStamp.
+ * @param sampleValType Output parameter to represent the type of sample value as described in nvmlValueType_t + * @param sampleCount Reference to provide the number of elements which can be queried in samples array + * @param samples Reference in which samples are returned + + * @return + * - \ref NVML_SUCCESS if samples are successfully retrieved + * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized + * - \ref NVML_ERROR_INVALID_ARGUMENT if \a device is invalid, \a sampleCount is NULL or + * reference to \a sampleCount is 0 for non null \a samples + * - \ref NVML_ERROR_NOT_SUPPORTED if this query is not supported by the device + * - \ref NVML_ERROR_GPU_IS_LOST if the target GPU has fallen off the bus or is otherwise inaccessible + * - \ref NVML_ERROR_NOT_FOUND if sample entries are not found + * - \ref NVML_ERROR_UNKNOWN on any unexpected error + */ +nvmlReturn_t DECLDIR nvmlDeviceGetSamples(nvmlDevice_t device, nvmlSamplingType_t type, unsigned long long lastSeenTimeStamp, + nvmlValueType_t *sampleValType, unsigned int *sampleCount, nvmlSample_t *samples); + +/** + * Gets Total, Available and Used size of BAR1 memory. + * + * BAR1 is used to map the FB (device memory) so that it can be directly accessed by the CPU or by 3rd party + * devices (peer-to-peer on the PCIE bus). + * + * For Kepler &tm; or newer fully supported devices. + * + * @param device The identifier of the target device + * @param bar1Memory Reference in which BAR1 memory + * information is returned.
+ * + * @return + * - \ref NVML_SUCCESS if BAR1 memory is successfully retrieved + * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized + * - \ref NVML_ERROR_INVALID_ARGUMENT if \a device is invalid, \a bar1Memory is NULL + * - \ref NVML_ERROR_NOT_SUPPORTED if this query is not supported by the device + * - \ref NVML_ERROR_GPU_IS_LOST if the target GPU has fallen off the bus or is otherwise inaccessible + * - \ref NVML_ERROR_UNKNOWN on any unexpected error + * + */ +nvmlReturn_t DECLDIR nvmlDeviceGetBAR1MemoryInfo(nvmlDevice_t device, nvmlBAR1Memory_t *bar1Memory); + + +/** + * Gets the duration of time during which the device was throttled (lower than requested clocks) due to power + * or thermal constraints. + * + * The method is important to users who are trying to understand if their GPUs throttle at any point during their applications. The + * difference in violation times at two different reference times gives the indication of GPU throttling event. + * + * Violation for thermal capping is not supported at this time. + * + * For Kepler &tm; or newer fully supported devices.
+ * + * @param device The identifier of the target device + * @param perfPolicyType Represents Performance policy which can trigger GPU throttling + * @param violTime Reference to which violation time related information is returned + * + * + * @return + * - \ref NVML_SUCCESS if violation time is successfully retrieved + * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized + * - \ref NVML_ERROR_INVALID_ARGUMENT if \a device is invalid, \a perfPolicyType is invalid, or \a violTime is NULL + * - \ref NVML_ERROR_NOT_SUPPORTED if this query is not supported by the device + * - \ref NVML_ERROR_GPU_IS_LOST if the target GPU has fallen off the bus or is otherwise inaccessible + * + */ +nvmlReturn_t DECLDIR nvmlDeviceGetViolationStatus(nvmlDevice_t device, nvmlPerfPolicyType_t perfPolicyType, nvmlViolationTime_t *violTime); + +/** + * @} + */ + +/** @addtogroup nvmlAccountingStats + * @{ + */ + +/** + * Queries the state of per process accounting mode. + * + * For Kepler &tm; or newer fully supported devices. + * + * See \ref nvmlDeviceGetAccountingStats for more details. + * See \ref nvmlDeviceSetAccountingMode + * + * @param device The identifier of the target device + * @param mode Reference in which to return the current accounting mode + * + * @return + * - \ref NVML_SUCCESS if the mode has been successfully retrieved + * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized + * - \ref NVML_ERROR_INVALID_ARGUMENT if \a device is invalid or \a mode is NULL + * - \ref NVML_ERROR_NOT_SUPPORTED if the device doesn't support this feature + * - \ref NVML_ERROR_UNKNOWN on any unexpected error + */ +nvmlReturn_t DECLDIR nvmlDeviceGetAccountingMode(nvmlDevice_t device, nvmlEnableState_t *mode); + +/** + * Queries process's accounting stats. + * + * For Kepler &tm; or newer fully supported devices. + * + * Accounting stats capture GPU utilization and other statistics across the lifetime of a process.
+ * Accounting stats can be queried during life time of the process and after its termination. + * The time field in \ref nvmlAccountingStats_t is reported as 0 during the lifetime of the process and + * updated to actual running time after its termination. + * Accounting stats are kept in a circular buffer, newly created processes overwrite information about old + * processes. + * + * See \ref nvmlAccountingStats_t for description of each returned metric. + * List of processes that can be queried can be retrieved from \ref nvmlDeviceGetAccountingPids. + * + * @note Accounting Mode needs to be on. See \ref nvmlDeviceGetAccountingMode. + * @note Only compute and graphics applications stats can be queried. Monitoring applications stats can't be + * queried since they don't contribute to GPU utilization. + * @note In case of pid collision stats of only the latest process (that terminated last) will be reported + * + * @warning On Kepler devices per process statistics are accurate only if there's one process running on a GPU. + * + * @param device The identifier of the target device + * @param pid Process Id of the target process to query stats for + * @param stats Reference in which to return the process's accounting stats + * + * @return + * - \ref NVML_SUCCESS if stats have been successfully retrieved + * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized + * - \ref NVML_ERROR_INVALID_ARGUMENT if \a device is invalid or \a stats are NULL + * - \ref NVML_ERROR_NOT_FOUND if process stats were not found + * - \ref NVML_ERROR_NOT_SUPPORTED if the device doesn't support this feature or accounting mode is disabled + * - \ref NVML_ERROR_UNKNOWN on any unexpected error + * + * @see nvmlDeviceGetAccountingBufferSize + */ +nvmlReturn_t DECLDIR nvmlDeviceGetAccountingStats(nvmlDevice_t device, unsigned int pid, nvmlAccountingStats_t *stats); + +/** + * Queries list of processes that can be queried for accounting stats. 
The list of processes returned + * can be in running or terminated state. + * + * For Kepler &tm; or newer fully supported devices. + * + * To just query the number of processes ready to be queried, call this function with *count = 0 and + * pids=NULL. The return code will be NVML_ERROR_INSUFFICIENT_SIZE, or NVML_SUCCESS if list is empty. + * + * For more details see \ref nvmlDeviceGetAccountingStats. + * + * @note In case of PID collision some processes might not be accessible before the circular buffer is full. + * + * @param device The identifier of the target device + * @param count Reference in which to provide the \a pids array size, and + * to return the number of elements ready to be queried + * @param pids Reference in which to return list of process ids + * + * @return + * - \ref NVML_SUCCESS if pids were successfully retrieved + * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized + * - \ref NVML_ERROR_INVALID_ARGUMENT if \a device is invalid or \a count is NULL + * - \ref NVML_ERROR_NOT_SUPPORTED if the device doesn't support this feature or accounting mode is disabled + * - \ref NVML_ERROR_INSUFFICIENT_SIZE if \a count is too small (\a count is set to + * expected value) + * - \ref NVML_ERROR_UNKNOWN on any unexpected error + * + * @see nvmlDeviceGetAccountingBufferSize + */ +nvmlReturn_t DECLDIR nvmlDeviceGetAccountingPids(nvmlDevice_t device, unsigned int *count, unsigned int *pids); + +/** + * Returns the number of processes that the circular buffer with accounting pids can hold. + * + * For Kepler &tm; or newer fully supported devices. + * + * This is the maximum number of processes that accounting information will be stored for before information + * about oldest processes will get overwritten by information about new processes. 
+ * + * @param device The identifier of the target device + * @param bufferSize Reference in which to provide the size (in number of elements) + * of the circular buffer for accounting stats. + * + * @return + * - \ref NVML_SUCCESS if buffer size was successfully retrieved + * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized + * - \ref NVML_ERROR_INVALID_ARGUMENT if \a device is invalid or \a bufferSize is NULL + * - \ref NVML_ERROR_NOT_SUPPORTED if the device doesn't support this feature or accounting mode is disabled + * - \ref NVML_ERROR_UNKNOWN on any unexpected error + * + * @see nvmlDeviceGetAccountingStats + * @see nvmlDeviceGetAccountingPids + */ +nvmlReturn_t DECLDIR nvmlDeviceGetAccountingBufferSize(nvmlDevice_t device, unsigned int *bufferSize); + +/** @} */ + +/** @addtogroup nvmlDeviceQueries + * @{ + */ + +/** + * Returns the list of retired pages by source, including pages that are pending retirement + * The address information provided from this API is the hardware address of the page that was retired. Note + * that this does not match the virtual address used in CUDA, but will match the address information in XID 63 + * + * For Kepler &tm; or newer fully supported devices. + * + * @param device The identifier of the target device + * @param cause Filter page addresses by cause of retirement + * @param pageCount Reference in which to provide the \a addresses buffer size, and + * to return the number of retired pages that match \a cause + * Set to 0 to query the size without allocating an \a addresses buffer + * @param addresses Buffer to write the page addresses into + * + * @return + * - \ref NVML_SUCCESS if \a pageCount was populated and \a addresses was filled + * - \ref NVML_ERROR_INSUFFICIENT_SIZE if \a pageCount indicates the buffer is not large enough to store all the + * matching page addresses. \a pageCount is set to the needed size. 
+ * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized + * - \ref NVML_ERROR_INVALID_ARGUMENT if \a device is invalid, \a pageCount is NULL, \a cause is invalid, or + * \a addresses is NULL + * - \ref NVML_ERROR_NOT_SUPPORTED if the device doesn't support this feature + * - \ref NVML_ERROR_GPU_IS_LOST if the target GPU has fallen off the bus or is otherwise inaccessible + * - \ref NVML_ERROR_UNKNOWN on any unexpected error + */ +nvmlReturn_t DECLDIR nvmlDeviceGetRetiredPages(nvmlDevice_t device, nvmlPageRetirementCause_t cause, + unsigned int *pageCount, unsigned long long *addresses); + +/** + * Check if any pages are pending retirement and need a reboot to fully retire. + * + * For Kepler &tm; or newer fully supported devices. + * + * @param device The identifier of the target device + * @param isPending Reference in which to return the pending status + * + * @return + * - \ref NVML_SUCCESS if \a isPending was populated + * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized + * - \ref NVML_ERROR_INVALID_ARGUMENT if \a device is invalid or \a isPending is NULL + * - \ref NVML_ERROR_NOT_SUPPORTED if the device doesn't support this feature + * - \ref NVML_ERROR_GPU_IS_LOST if the target GPU has fallen off the bus or is otherwise inaccessible + * - \ref NVML_ERROR_UNKNOWN on any unexpected error + */ +nvmlReturn_t DECLDIR nvmlDeviceGetRetiredPagesPendingStatus(nvmlDevice_t device, nvmlEnableState_t *isPending); + +/** @} */ + +/***************************************************************************************************/ +/** @defgroup nvmlUnitCommands Unit Commands + * This chapter describes NVML operations that change the state of the unit. For S-class products. + * Each of these requires root/admin access. Non-admin users will see an NVML_ERROR_NO_PERMISSION + * error code when invoking any of these methods. 
+ * @{ + */ +/***************************************************************************************************/ + +/** + * Set the LED state for the unit. The LED can be either green (0) or amber (1). + * + * For S-class products. + * Requires root/admin permissions. + * + * This operation takes effect immediately. + * + * + * Current S-Class products don't provide unique LEDs for each unit. As such, both front + * and back LEDs will be toggled in unison regardless of which unit is specified with this command. + * + * See \ref nvmlLedColor_t for available colors. + * + * @param unit The identifier of the target unit + * @param color The target LED color + * + * @return + * - \ref NVML_SUCCESS if the LED color has been set + * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized + * - \ref NVML_ERROR_INVALID_ARGUMENT if \a unit or \a color is invalid + * - \ref NVML_ERROR_NOT_SUPPORTED if this is not an S-class product + * - \ref NVML_ERROR_NO_PERMISSION if the user doesn't have permission to perform this operation + * - \ref NVML_ERROR_UNKNOWN on any unexpected error + * + * @see nvmlUnitGetLedState() + */ +nvmlReturn_t DECLDIR nvmlUnitSetLedState(nvmlUnit_t unit, nvmlLedColor_t color); + +/** @} */ + +/***************************************************************************************************/ +/** @defgroup nvmlDeviceCommands Device Commands + * This chapter describes NVML operations that change the state of the device. + * Each of these requires root/admin access. Non-admin users will see an NVML_ERROR_NO_PERMISSION + * error code when invoking any of these methods. + * @{ + */ +/***************************************************************************************************/ + +/** + * Set the persistence mode for the device. + * + * For all products. + * For Linux only. + * Requires root/admin permissions. 
+ * + * The persistence mode determines whether the GPU driver software is torn down after the last client + * exits. + * + * This operation takes effect immediately. It is not persistent across reboots. After each reboot the + * persistence mode is reset to "Disabled". + * + * See \ref nvmlEnableState_t for available modes. + * + * @param device The identifier of the target device + * @param mode The target persistence mode + * + * @return + * - \ref NVML_SUCCESS if the persistence mode was set + * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized + * - \ref NVML_ERROR_INVALID_ARGUMENT if \a device is invalid or \a mode is invalid + * - \ref NVML_ERROR_NOT_SUPPORTED if the device does not support this feature + * - \ref NVML_ERROR_NO_PERMISSION if the user doesn't have permission to perform this operation + * - \ref NVML_ERROR_GPU_IS_LOST if the target GPU has fallen off the bus or is otherwise inaccessible + * - \ref NVML_ERROR_UNKNOWN on any unexpected error + * + * @see nvmlDeviceGetPersistenceMode() + */ +nvmlReturn_t DECLDIR nvmlDeviceSetPersistenceMode(nvmlDevice_t device, nvmlEnableState_t mode); + +/** + * Set the compute mode for the device. + * + * For all products. + * Requires root/admin permissions. + * + * The compute mode determines whether a GPU can be used for compute operations and whether it can + * be shared across contexts. + * + * This operation takes effect immediately. Under Linux it is not persistent across reboots and + * always resets to "Default". Under windows it is persistent. + * + * Under windows compute mode may only be set to DEFAULT when running in WDDM + * + * See \ref nvmlComputeMode_t for details on available compute modes. 
+ * + * @param device The identifier of the target device + * @param mode The target compute mode + * + * @return + * - \ref NVML_SUCCESS if the compute mode was set + * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized + * - \ref NVML_ERROR_INVALID_ARGUMENT if \a device is invalid or \a mode is invalid + * - \ref NVML_ERROR_NOT_SUPPORTED if the device does not support this feature + * - \ref NVML_ERROR_NO_PERMISSION if the user doesn't have permission to perform this operation + * - \ref NVML_ERROR_GPU_IS_LOST if the target GPU has fallen off the bus or is otherwise inaccessible + * - \ref NVML_ERROR_UNKNOWN on any unexpected error + * + * @see nvmlDeviceGetComputeMode() + */ +nvmlReturn_t DECLDIR nvmlDeviceSetComputeMode(nvmlDevice_t device, nvmlComputeMode_t mode); + +/** + * Set the ECC mode for the device. + * + * For Kepler &tm; or newer fully supported devices. + * Only applicable to devices with ECC. + * Requires \a NVML_INFOROM_ECC version 1.0 or higher. + * Requires root/admin permissions. + * + * The ECC mode determines whether the GPU enables its ECC support. + * + * This operation takes effect after the next reboot. + * + * See \ref nvmlEnableState_t for details on available modes. 
+ * + * @param device The identifier of the target device + * @param ecc The target ECC mode + * + * @return + * - \ref NVML_SUCCESS if the ECC mode was set + * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized + * - \ref NVML_ERROR_INVALID_ARGUMENT if \a device is invalid or \a ecc is invalid + * - \ref NVML_ERROR_NOT_SUPPORTED if the device does not support this feature + * - \ref NVML_ERROR_NO_PERMISSION if the user doesn't have permission to perform this operation + * - \ref NVML_ERROR_GPU_IS_LOST if the target GPU has fallen off the bus or is otherwise inaccessible + * - \ref NVML_ERROR_UNKNOWN on any unexpected error + * + * @see nvmlDeviceGetEccMode() + */ +nvmlReturn_t DECLDIR nvmlDeviceSetEccMode(nvmlDevice_t device, nvmlEnableState_t ecc); + +/** + * Clear the ECC error and other memory error counts for the device. + * + * For Kepler &tm; or newer fully supported devices. + * Only applicable to devices with ECC. + * Requires \a NVML_INFOROM_ECC version 2.0 or higher to clear aggregate location-based ECC counts. + * Requires \a NVML_INFOROM_ECC version 1.0 or higher to clear all other ECC counts. + * Requires root/admin permissions. + * Requires ECC Mode to be enabled. + * + * Sets all of the specified ECC counters to 0, including both detailed and total counts. + * + * This operation takes effect immediately. + * + * See \ref nvmlMemoryErrorType_t for details on available counter types. + * + * @param device The identifier of the target device + * @param counterType Flag that indicates which type of errors should be cleared. 
+ * + * @return + * - \ref NVML_SUCCESS if the error counts were cleared + * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized + * - \ref NVML_ERROR_INVALID_ARGUMENT if \a device is invalid or \a counterType is invalid + * - \ref NVML_ERROR_NOT_SUPPORTED if the device does not support this feature + * - \ref NVML_ERROR_NO_PERMISSION if the user doesn't have permission to perform this operation + * - \ref NVML_ERROR_GPU_IS_LOST if the target GPU has fallen off the bus or is otherwise inaccessible + * - \ref NVML_ERROR_UNKNOWN on any unexpected error + * + * @see + * - nvmlDeviceGetDetailedEccErrors() + * - nvmlDeviceGetTotalEccErrors() + */ +nvmlReturn_t DECLDIR nvmlDeviceClearEccErrorCounts(nvmlDevice_t device, nvmlEccCounterType_t counterType); + +/** + * Set the driver model for the device. + * + * For Fermi &tm; or newer fully supported devices. + * For windows only. + * Requires root/admin permissions. + * + * On Windows platforms the device driver can run in either WDDM or WDM (TCC) mode. If a display is attached + * to the device it must run in WDDM mode. + * + * It is possible to force the change to WDM (TCC) while the display is still attached with a force flag (nvmlFlagForce). + * This should only be done if the host is subsequently powered down and the display is detached from the device + * before the next reboot. + * + * This operation takes effect after the next reboot. + * + * Windows driver model may only be set to WDDM when running in DEFAULT compute mode. + * + * Change driver model to WDDM is not supported when GPU doesn't support graphics acceleration or + * will not support it after reboot. See \ref nvmlDeviceSetGpuOperationMode. + * + * See \ref nvmlDriverModel_t for details on available driver models. 
+ * See \ref nvmlFlagDefault and \ref nvmlFlagForce + * + * @param device The identifier of the target device + * @param driverModel The target driver model + * @param flags Flags that change the default behavior + * + * @return + * - \ref NVML_SUCCESS if the driver model has been set + * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized + * - \ref NVML_ERROR_INVALID_ARGUMENT if \a device is invalid or \a driverModel is invalid + * - \ref NVML_ERROR_NOT_SUPPORTED if the platform is not windows or the device does not support this feature + * - \ref NVML_ERROR_NO_PERMISSION if the user doesn't have permission to perform this operation + * - \ref NVML_ERROR_GPU_IS_LOST if the target GPU has fallen off the bus or is otherwise inaccessible + * - \ref NVML_ERROR_UNKNOWN on any unexpected error + * + * @see nvmlDeviceGetDriverModel() + */ +nvmlReturn_t DECLDIR nvmlDeviceSetDriverModel(nvmlDevice_t device, nvmlDriverModel_t driverModel, unsigned int flags); + +/** + * Set clocks that applications will lock to. + * + * Sets the clocks that compute and graphics applications will be running at. + * e.g. CUDA driver requests these clocks during context creation which means this property + * defines clocks at which CUDA applications will be running unless some overspec event + * occurs (e.g. over power, over thermal or external HW brake). + * + * Can be used as a setting to request constant performance. + * + * On Pascal and newer hardware, this will automatically disable automatic boosting of clocks. + * + * On K80 and newer Kepler and Maxwell GPUs, users desiring fixed performance should also call + * \ref nvmlDeviceSetAutoBoostedClocksEnabled to prevent clocks from automatically boosting + * above the clock value being set. + * + * For Kepler &tm; or newer non-GeForce fully supported devices and Maxwell or newer GeForce devices. + * Requires root/admin permissions. 
+ * + * See \ref nvmlDeviceGetSupportedMemoryClocks and \ref nvmlDeviceGetSupportedGraphicsClocks + * for details on how to list available clocks combinations. + * + * After system reboot or driver reload applications clocks go back to their default value. + * See \ref nvmlDeviceResetApplicationsClocks. + * + * @param device The identifier of the target device + * @param memClockMHz Requested memory clock in MHz + * @param graphicsClockMHz Requested graphics clock in MHz + * + * @return + * - \ref NVML_SUCCESS if new settings were successfully set + * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized + * - \ref NVML_ERROR_INVALID_ARGUMENT if \a device is invalid or \a memClockMHz and \a graphicsClockMHz + * is not a valid clock combination + * - \ref NVML_ERROR_NO_PERMISSION if the user doesn't have permission to perform this operation + * - \ref NVML_ERROR_NOT_SUPPORTED if the device doesn't support this feature + * - \ref NVML_ERROR_GPU_IS_LOST if the target GPU has fallen off the bus or is otherwise inaccessible + * - \ref NVML_ERROR_UNKNOWN on any unexpected error + */ +nvmlReturn_t DECLDIR nvmlDeviceSetApplicationsClocks(nvmlDevice_t device, unsigned int memClockMHz, unsigned int graphicsClockMHz); + +/** + * Set new power limit of this device. + * + * For Kepler &tm; or newer fully supported devices. + * Requires root/admin permissions. + * + * See \ref nvmlDeviceGetPowerManagementLimitConstraints to check the allowed ranges of values. + * + * \note Limit is not persistent across reboots or driver unloads. + * Enable persistent mode to prevent driver from unloading when no application is using the device. 
+ * + * @param device The identifier of the target device + * @param limit Power management limit in milliwatts to set + * + * @return + * - \ref NVML_SUCCESS if \a limit has been set + * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized + * - \ref NVML_ERROR_INVALID_ARGUMENT if \a device is invalid or \a limit is out of range + * - \ref NVML_ERROR_NOT_SUPPORTED if the device does not support this feature + * - \ref NVML_ERROR_GPU_IS_LOST if the target GPU has fallen off the bus or is otherwise inaccessible + * - \ref NVML_ERROR_UNKNOWN on any unexpected error + * + * @see nvmlDeviceGetPowerManagementLimitConstraints + * @see nvmlDeviceGetPowerManagementDefaultLimit + */ +nvmlReturn_t DECLDIR nvmlDeviceSetPowerManagementLimit(nvmlDevice_t device, unsigned int limit); + +/** + * Sets new GOM. See \a nvmlGpuOperationMode_t for details. + * + * For GK110 M-class and X-class Tesla &tm; products from the Kepler family. + * Modes \ref NVML_GOM_LOW_DP and \ref NVML_GOM_ALL_ON are supported on fully supported GeForce products. + * Not supported on Quadro ® and Tesla &tm; C-class products. + * Requires root/admin permissions. + * + * Changing GOMs requires a reboot. + * The reboot requirement might be removed in the future. + * + * Compute only GOMs don't support graphics acceleration. Under windows switching to these GOMs when + * pending driver model is WDDM is not supported. See \ref nvmlDeviceSetDriverModel.
+ * + * @param device The identifier of the target device + * @param mode Target GOM + * + * @return + * - \ref NVML_SUCCESS if \a mode has been set + * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized + * - \ref NVML_ERROR_INVALID_ARGUMENT if \a device is invalid or \a mode is incorrect + * - \ref NVML_ERROR_NOT_SUPPORTED if the device does not support GOM or specific mode + * - \ref NVML_ERROR_NO_PERMISSION if the user doesn't have permission to perform this operation + * - \ref NVML_ERROR_GPU_IS_LOST if the target GPU has fallen off the bus or is otherwise inaccessible + * - \ref NVML_ERROR_UNKNOWN on any unexpected error + * + * @see nvmlGpuOperationMode_t + * @see nvmlDeviceGetGpuOperationMode + */ +nvmlReturn_t DECLDIR nvmlDeviceSetGpuOperationMode(nvmlDevice_t device, nvmlGpuOperationMode_t mode); + +/** + * Changes the root/admin restrictions on certain APIs. See \a nvmlRestrictedAPI_t for the list of supported APIs. + * This method can be used by a root/admin user to give non-root/admin access to certain otherwise-restricted APIs. + * The new setting lasts for the lifetime of the NVIDIA driver; it is not persistent. See \a nvmlDeviceGetAPIRestriction + * to query the current restriction settings. + * + * For Kepler &tm; or newer fully supported devices. + * Requires root/admin permissions. + * + * @param device The identifier of the target device + * @param apiType Target API type for this operation + * @param isRestricted The target restriction + * + * @return + * - \ref NVML_SUCCESS if \a isRestricted has been set + * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized + * - \ref NVML_ERROR_INVALID_ARGUMENT if \a device is invalid or \a apiType is incorrect + * - \ref NVML_ERROR_NOT_SUPPORTED if the device does not support changing API restrictions or the device does not support + * the feature that api restrictions are being set for (E.G.
Enabling/disabling auto + * boosted clocks is not supported by the device) + * - \ref NVML_ERROR_NO_PERMISSION if the user doesn't have permission to perform this operation + * - \ref NVML_ERROR_GPU_IS_LOST if the target GPU has fallen off the bus or is otherwise inaccessible + * - \ref NVML_ERROR_UNKNOWN on any unexpected error + * + * @see nvmlRestrictedAPI_t + */ +nvmlReturn_t DECLDIR nvmlDeviceSetAPIRestriction(nvmlDevice_t device, nvmlRestrictedAPI_t apiType, nvmlEnableState_t isRestricted); + +/** + * @} + */ + +/** @addtogroup nvmlAccountingStats + * @{ + */ + +/** + * Enables or disables per process accounting. + * + * For Kepler &tm; or newer fully supported devices. + * Requires root/admin permissions. + * + * @note This setting is not persistent and will default to disabled after driver unloads. + * Enable persistence mode to be sure the setting doesn't switch off to disabled. + * + * @note Enabling accounting mode has no negative impact on the GPU performance. + * + * @note Disabling accounting clears all accounting pids information. + * + * See \ref nvmlDeviceGetAccountingMode + * See \ref nvmlDeviceGetAccountingStats + * See \ref nvmlDeviceClearAccountingPids + * + * @param device The identifier of the target device + * @param mode The target accounting mode + * + * @return + * - \ref NVML_SUCCESS if the new mode has been set + * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized + * - \ref NVML_ERROR_INVALID_ARGUMENT if \a device or \a mode are invalid + * - \ref NVML_ERROR_NOT_SUPPORTED if the device doesn't support this feature + * - \ref NVML_ERROR_NO_PERMISSION if the user doesn't have permission to perform this operation + * - \ref NVML_ERROR_UNKNOWN on any unexpected error + */ +nvmlReturn_t DECLDIR nvmlDeviceSetAccountingMode(nvmlDevice_t device, nvmlEnableState_t mode); + +/** + * Clears accounting information about all processes that have already terminated. 
+ * + * For Kepler &tm; or newer fully supported devices. + * Requires root/admin permissions. + * + * See \ref nvmlDeviceGetAccountingMode + * See \ref nvmlDeviceGetAccountingStats + * See \ref nvmlDeviceSetAccountingMode + * + * @param device The identifier of the target device + * + * @return + * - \ref NVML_SUCCESS if accounting information has been cleared + * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized + * - \ref NVML_ERROR_INVALID_ARGUMENT if \a device are invalid + * - \ref NVML_ERROR_NOT_SUPPORTED if the device doesn't support this feature + * - \ref NVML_ERROR_NO_PERMISSION if the user doesn't have permission to perform this operation + * - \ref NVML_ERROR_UNKNOWN on any unexpected error + */ +nvmlReturn_t DECLDIR nvmlDeviceClearAccountingPids(nvmlDevice_t device); + +/** @} */ + +/***************************************************************************************************/ +/** @defgroup NvLink NvLink Methods + * This chapter describes methods that NVML can perform on NVLINK enabled devices. + * @{ + */ +/***************************************************************************************************/ + +/** + * Retrieves the state of the device's NvLink for the link specified + * + * For Pascal &tm; or newer fully supported devices. 
+ * + * @param device The identifier of the target device + * @param link Specifies the NvLink link to be queried + * @param isActive \a nvmlEnableState_t where NVML_FEATURE_ENABLED indicates that + * the link is active and NVML_FEATURE_DISABLED indicates it + * is inactive + * + * @return + * - \ref NVML_SUCCESS if \a isActive has been set + * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized + * - \ref NVML_ERROR_INVALID_ARGUMENT if \a device or \a link is invalid or \a isActive is NULL + * - \ref NVML_ERROR_NOT_SUPPORTED if the device doesn't support this feature + * - \ref NVML_ERROR_UNKNOWN on any unexpected error + */ +nvmlReturn_t DECLDIR nvmlDeviceGetNvLinkState(nvmlDevice_t device, unsigned int link, nvmlEnableState_t *isActive); + +/** + * Retrieves the version of the device's NvLink for the link specified + * + * For Pascal &tm; or newer fully supported devices. + * + * @param device The identifier of the target device + * @param link Specifies the NvLink link to be queried + * @param version Requested NvLink version + * + * @return + * - \ref NVML_SUCCESS if \a version has been set + * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized + * - \ref NVML_ERROR_INVALID_ARGUMENT if \a device or \a link is invalid or \a version is NULL + * - \ref NVML_ERROR_NOT_SUPPORTED if the device doesn't support this feature + * - \ref NVML_ERROR_UNKNOWN on any unexpected error + */ +nvmlReturn_t DECLDIR nvmlDeviceGetNvLinkVersion(nvmlDevice_t device, unsigned int link, unsigned int *version); + +/** + * Retrieves the requested capability from the device's NvLink for the link specified + * Please refer to the \a nvmlNvLinkCapability_t structure for the specific caps that can be queried + * The return value should be treated as a boolean. + * + * For Pascal &tm; or newer fully supported devices. 
+ * + * @param device The identifier of the target device + * @param link Specifies the NvLink link to be queried + * @param capability Specifies the \a nvmlNvLinkCapability_t to be queried + * @param capResult A boolean for the queried capability indicating that feature is available + * + * @return + * - \ref NVML_SUCCESS if \a capResult has been set + * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized + * - \ref NVML_ERROR_INVALID_ARGUMENT if \a device, \a link, or \a capability is invalid or \a capResult is NULL + * - \ref NVML_ERROR_NOT_SUPPORTED if the device doesn't support this feature + * - \ref NVML_ERROR_UNKNOWN on any unexpected error + */ +nvmlReturn_t DECLDIR nvmlDeviceGetNvLinkCapability(nvmlDevice_t device, unsigned int link, + nvmlNvLinkCapability_t capability, unsigned int *capResult); + +/** + * Retrieves the PCI information for the remote node on a NvLink link + * Note: pciSubSystemId is not filled in this function and is indeterminate + * + * For Pascal &tm; or newer fully supported devices. + * + * @param device The identifier of the target device + * @param link Specifies the NvLink link to be queried + * @param pci \a nvmlPciInfo_t of the remote node for the specified link + * + * @return + * - \ref NVML_SUCCESS if \a pci has been set + * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized + * - \ref NVML_ERROR_INVALID_ARGUMENT if \a device or \a link is invalid or \a pci is NULL + * - \ref NVML_ERROR_NOT_SUPPORTED if the device doesn't support this feature + * - \ref NVML_ERROR_UNKNOWN on any unexpected error + */ +nvmlReturn_t DECLDIR nvmlDeviceGetNvLinkRemotePciInfo(nvmlDevice_t device, unsigned int link, nvmlPciInfo_t *pci); + +/** + * Retrieves the specified error counter value + * Please refer to \a nvmlNvLinkErrorCounter_t for error counters that are available + * + * For Pascal &tm; or newer fully supported devices. 
+ * + * @param device The identifier of the target device + * @param link Specifies the NvLink link to be queried + * @param counter Specifies the NvLink counter to be queried + * @param counterValue Returned counter value + * + * @return + * - \ref NVML_SUCCESS if \a counterValue has been set + * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized + * - \ref NVML_ERROR_INVALID_ARGUMENT if \a device, \a link, or \a counter is invalid or \a counterValue is NULL + * - \ref NVML_ERROR_NOT_SUPPORTED if the device doesn't support this feature + * - \ref NVML_ERROR_UNKNOWN on any unexpected error + */ +nvmlReturn_t DECLDIR nvmlDeviceGetNvLinkErrorCounter(nvmlDevice_t device, unsigned int link, + nvmlNvLinkErrorCounter_t counter, unsigned long long *counterValue); + +/** + * Resets all error counters to zero + * Please refer to \a nvmlNvLinkErrorCounter_t for the list of error counters that are reset + * + * For Pascal &tm; or newer fully supported devices. + * + * @param device The identifier of the target device + * @param link Specifies the NvLink link whose error counters are to be reset + * + * @return + * - \ref NVML_SUCCESS if the reset is successful + * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized + * - \ref NVML_ERROR_INVALID_ARGUMENT if \a device or \a link is invalid + * - \ref NVML_ERROR_NOT_SUPPORTED if the device doesn't support this feature + * - \ref NVML_ERROR_UNKNOWN on any unexpected error + */ +nvmlReturn_t DECLDIR nvmlDeviceResetNvLinkErrorCounters(nvmlDevice_t device, unsigned int link); + +/** + * Set the NVLINK utilization counter control information for the specified counter, 0 or 1. + * Please refer to \a nvmlNvLinkUtilizationControl_t for the structure definition. Performs a reset + * of the counters if the reset parameter is non-zero. + * + * For Pascal &tm; or newer fully supported devices. 
+ * + * @param device The identifier of the target device + * @param counter Specifies the counter that should be set (0 or 1). + * @param link Specifies the NvLink link to be queried + * @param control A reference to the \a nvmlNvLinkUtilizationControl_t to set + * @param reset Resets the counters on set if non-zero + * + * @return + * - \ref NVML_SUCCESS if the control has been set successfully + * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized + * - \ref NVML_ERROR_INVALID_ARGUMENT if \a device, \a counter, \a link, or \a control is invalid + * - \ref NVML_ERROR_NOT_SUPPORTED if the device doesn't support this feature + * - \ref NVML_ERROR_UNKNOWN on any unexpected error + */ +nvmlReturn_t DECLDIR nvmlDeviceSetNvLinkUtilizationControl(nvmlDevice_t device, unsigned int link, unsigned int counter, + nvmlNvLinkUtilizationControl_t *control, unsigned int reset); + +/** + * Get the NVLINK utilization counter control information for the specified counter, 0 or 1. + * Please refer to \a nvmlNvLinkUtilizationControl_t for the structure definition + * + * For Pascal &tm; or newer fully supported devices. + * + * @param device The identifier of the target device + * @param counter Specifies the counter that should be set (0 or 1). 
+ * @param link Specifies the NvLink link to be queried + * @param control A reference to the \a nvmlNvLinkUtilizationControl_t in which to place the control information + * + * @return + * - \ref NVML_SUCCESS if \a control has been retrieved successfully + * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized + * - \ref NVML_ERROR_INVALID_ARGUMENT if \a device, \a counter, \a link, or \a control is invalid + * - \ref NVML_ERROR_NOT_SUPPORTED if the device doesn't support this feature + * - \ref NVML_ERROR_UNKNOWN on any unexpected error + */ +nvmlReturn_t DECLDIR nvmlDeviceGetNvLinkUtilizationControl(nvmlDevice_t device, unsigned int link, unsigned int counter, + nvmlNvLinkUtilizationControl_t *control); + + +/** + * Retrieve the NVLINK utilization counter based on the current control for a specified counter. + * In general it is good practice to use \a nvmlDeviceSetNvLinkUtilizationControl + * before reading the utilization counters as they have no default state + * + * For Pascal &tm; or newer fully supported devices. + * + * @param device The identifier of the target device + * @param link Specifies the NvLink link to be queried + * @param counter Specifies the counter that should be read (0 or 1). 
+ * @param rxcounter Receive counter return value + * @param txcounter Transmit counter return value + * + * @return + * - \ref NVML_SUCCESS if \a rxcounter and \a txcounter have been successfully set + * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized + * - \ref NVML_ERROR_INVALID_ARGUMENT if \a device, \a counter, or \a link is invalid or \a rxcounter or \a txcounter are NULL + * - \ref NVML_ERROR_NOT_SUPPORTED if the device doesn't support this feature + * - \ref NVML_ERROR_UNKNOWN on any unexpected error + */ +nvmlReturn_t DECLDIR nvmlDeviceGetNvLinkUtilizationCounter(nvmlDevice_t device, unsigned int link, unsigned int counter, + unsigned long long *rxcounter, unsigned long long *txcounter); + +/** + * Freeze the NVLINK utilization counters + * Both the receive and transmit counters are operated on by this function + * + * For Pascal &tm; or newer fully supported devices. + * + * @param device The identifier of the target device + * @param link Specifies the NvLink link to be queried + * @param counter Specifies the counter that should be frozen (0 or 1). 
+ * @param freeze NVML_FEATURE_ENABLED = freeze the receive and transmit counters + * NVML_FEATURE_DISABLED = unfreeze the receive and transmit counters + * + * @return + * - \ref NVML_SUCCESS if counters were successfully frozen or unfrozen + * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized + * - \ref NVML_ERROR_INVALID_ARGUMENT if \a device, \a link, \a counter, or \a freeze is invalid + * - \ref NVML_ERROR_NOT_SUPPORTED if the device doesn't support this feature + * - \ref NVML_ERROR_UNKNOWN on any unexpected error + */ +nvmlReturn_t DECLDIR nvmlDeviceFreezeNvLinkUtilizationCounter (nvmlDevice_t device, unsigned int link, + unsigned int counter, nvmlEnableState_t freeze); + +/** + * Reset the NVLINK utilization counters + * Both the receive and transmit counters are operated on by this function + * + * For Pascal &tm; or newer fully supported devices. + * + * @param device The identifier of the target device + * @param link Specifies the NvLink link to be reset + * @param counter Specifies the counter that should be reset (0 or 1) + * + * @return + * - \ref NVML_SUCCESS if counters were successfully reset + * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized + * - \ref NVML_ERROR_INVALID_ARGUMENT if \a device, \a link, or \a counter is invalid + * - \ref NVML_ERROR_NOT_SUPPORTED if the device doesn't support this feature + * - \ref NVML_ERROR_UNKNOWN on any unexpected error + */ +nvmlReturn_t DECLDIR nvmlDeviceResetNvLinkUtilizationCounter (nvmlDevice_t device, unsigned int link, unsigned int counter); + +/** @} */ + +/***************************************************************************************************/ +/** @defgroup nvmlEvents Event Handling Methods + * This chapter describes methods that NVML can perform against each device to register and wait for + * some event to occur. 
+ * @{ + */ +/***************************************************************************************************/ + +/** + * Create an empty set of events. + * Event set should be freed by \ref nvmlEventSetFree + * + * For Fermi &tm; or newer fully supported devices. + * @param set Reference in which to return the event handle + * + * @return + * - \ref NVML_SUCCESS if the event has been set + * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized + * - \ref NVML_ERROR_INVALID_ARGUMENT if \a set is NULL + * - \ref NVML_ERROR_UNKNOWN on any unexpected error + * + * @see nvmlEventSetFree + */ +nvmlReturn_t DECLDIR nvmlEventSetCreate(nvmlEventSet_t *set); + +/** + * Starts recording of events on a specified devices and add the events to specified \ref nvmlEventSet_t + * + * For Fermi &tm; or newer fully supported devices. + * Ecc events are available only on ECC enabled devices (see \ref nvmlDeviceGetTotalEccErrors) + * Power capping events are available only on Power Management enabled devices (see \ref nvmlDeviceGetPowerManagementMode) + * + * For Linux only. + * + * \b IMPORTANT: Operations on \a set are not thread safe + * + * This call starts recording of events on specific device. + * All events that occurred before this call are not recorded. + * Checking if some event occurred can be done with \ref nvmlEventSetWait + * + * If function reports NVML_ERROR_UNKNOWN, event set is in undefined state and should be freed. + * If function reports NVML_ERROR_NOT_SUPPORTED, event set can still be used. None of the requested eventTypes + * are registered in that case. 
+ * + * @param device The identifier of the target device + * @param eventTypes Bitmask of \ref nvmlEventType to record + * @param set Set to which to add new event types + * + * @return + * - \ref NVML_SUCCESS if the event has been set + * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized + * - \ref NVML_ERROR_INVALID_ARGUMENT if \a eventTypes is invalid or \a set is NULL + * - \ref NVML_ERROR_NOT_SUPPORTED if the platform does not support this feature or some of the requested event types + * - \ref NVML_ERROR_GPU_IS_LOST if the target GPU has fallen off the bus or is otherwise inaccessible + * - \ref NVML_ERROR_UNKNOWN on any unexpected error + * + * @see nvmlEventType + * @see nvmlDeviceGetSupportedEventTypes + * @see nvmlEventSetWait + * @see nvmlEventSetFree + */ +nvmlReturn_t DECLDIR nvmlDeviceRegisterEvents(nvmlDevice_t device, unsigned long long eventTypes, nvmlEventSet_t set); + +/** + * Returns information about events supported on device + * + * For Fermi &tm; or newer fully supported devices. + * + * Events are not supported on Windows. So this function returns an empty mask in \a eventTypes on Windows. + * + * @param device The identifier of the target device + * @param eventTypes Reference in which to return bitmask of supported events + * + * @return + * - \ref NVML_SUCCESS if \a eventTypes has been set + * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized + * - \ref NVML_ERROR_INVALID_ARGUMENT if \a eventTypes is NULL + * - \ref NVML_ERROR_GPU_IS_LOST if the target GPU has fallen off the bus or is otherwise inaccessible + * - \ref NVML_ERROR_UNKNOWN on any unexpected error + * + * @see nvmlEventType + * @see nvmlDeviceRegisterEvents + */ +nvmlReturn_t DECLDIR nvmlDeviceGetSupportedEventTypes(nvmlDevice_t device, unsigned long long *eventTypes); + +/** + * Waits on events and delivers events + * + * For Fermi &tm; or newer fully supported devices. 
+ * + * If some events are ready to be delivered at the time of the call, function returns immediately. + * If there are no events ready to be delivered, function sleeps till event arrives + * but not longer than specified timeout. This function in certain conditions can return before + * specified timeout passes (e.g. when interrupt arrives) + * + * In case of xid error, the function returns the most recent xid error type seen by the system. If there are multiple + * xid errors generated before nvmlEventSetWait is invoked then the last seen xid error type is returned for all + * xid error events. + * + * @param set Reference to set of events to wait on + * @param data Reference in which to return event data + * @param timeoutms Maximum amount of wait time in milliseconds for registered event + * + * @return + * - \ref NVML_SUCCESS if the data has been set + * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized + * - \ref NVML_ERROR_INVALID_ARGUMENT if \a data is NULL + * - \ref NVML_ERROR_TIMEOUT if no event arrived in specified timeout or interrupt arrived + * - \ref NVML_ERROR_GPU_IS_LOST if a GPU has fallen off the bus or is otherwise inaccessible + * - \ref NVML_ERROR_UNKNOWN on any unexpected error + * + * @see nvmlEventType + * @see nvmlDeviceRegisterEvents + */ +nvmlReturn_t DECLDIR nvmlEventSetWait(nvmlEventSet_t set, nvmlEventData_t * data, unsigned int timeoutms); + +/** + * Releases events in the set + * + * For Fermi &tm; or newer fully supported devices. 
+ * + * @param set Reference to events to be released + * + * @return + * - \ref NVML_SUCCESS if the event has been successfully released + * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized + * - \ref NVML_ERROR_UNKNOWN on any unexpected error + * + * @see nvmlDeviceRegisterEvents + */ +nvmlReturn_t DECLDIR nvmlEventSetFree(nvmlEventSet_t set); + +/** @} */ + +/***************************************************************************************************/ +/** @defgroup nvmlZPI Drain states + * This chapter describes methods that NVML can perform against each device to control their drain state + * and recognition by NVML and NVIDIA kernel driver. These methods can be used with out-of-band tools to + * power on/off GPUs, enable robust reset scenarios, etc. + * @{ + */ +/***************************************************************************************************/ + +/** + * Modify the drain state of a GPU. This method forces a GPU to no longer accept new incoming requests. + * Any new NVML process will no longer see this GPU. Persistence mode for this GPU must be turned off before + * this call is made. + * Must be called as administrator. + * For Linux only. + * + * For Pascal &tm; or newer fully supported devices. + * Some Kepler devices supported. 
+ * + * @param pciInfo The PCI address of the GPU whose drain state is to be modified + * @param newState The drain state that should be entered, see \ref nvmlEnableState_t + * + * @return + * - \ref NVML_SUCCESS if the drain state was successfully modified + * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized + * - \ref NVML_ERROR_INVALID_ARGUMENT if \a pciInfo or \a newState is invalid + * - \ref NVML_ERROR_NOT_SUPPORTED if the device doesn't support this feature + * - \ref NVML_ERROR_NO_PERMISSION if the calling process has insufficient permissions to perform operation + * - \ref NVML_ERROR_IN_USE if the device has persistence mode turned on + * - \ref NVML_ERROR_UNKNOWN on any unexpected error + */ +nvmlReturn_t DECLDIR nvmlDeviceModifyDrainState (nvmlPciInfo_t *pciInfo, nvmlEnableState_t newState); + +/** + * Query the drain state of a GPU. This method is used to check if a GPU is currently in a draining + * state. + * For Linux only. + * + * For Pascal &tm; or newer fully supported devices. + * Some Kepler devices supported. + * + * @param pciInfo The PCI address of the GPU whose drain state is to be queried + * @param currentState The current drain state for this GPU, see \ref nvmlEnableState_t + * + * @return + * - \ref NVML_SUCCESS if the drain state was successfully queried + * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized + * - \ref NVML_ERROR_INVALID_ARGUMENT if \a pciInfo or \a currentState is invalid + * - \ref NVML_ERROR_NOT_SUPPORTED if the device doesn't support this feature + * - \ref NVML_ERROR_UNKNOWN on any unexpected error + */ +nvmlReturn_t DECLDIR nvmlDeviceQueryDrainState (nvmlPciInfo_t *pciInfo, nvmlEnableState_t *currentState); + +/** + * This method will remove the specified GPU from the view of both NVML and the NVIDIA kernel driver + * as long as no other processes are attached. 
If other processes are attached, this call will return + * NVML_ERROR_IN_USE and the GPU will be returned to its original "draining" state. Note: the + * only situation where a process can still be attached after nvmlDeviceModifyDrainState() is called + * to initiate the draining state is if that process was using, and is still using, a GPU before the + * call was made. Also note, persistence mode counts as an attachment to the GPU thus it must be disabled + * prior to this call. + * + * For long-running NVML processes please note that this will change the enumeration of current GPUs. + * For example, if there are four GPUs present and GPU1 is removed, the new enumeration will be 0-2. + * Also, device handles after the removed GPU will not be valid and must be re-established. + * Must be run as administrator. + * For Linux only. + * + * For Pascal &tm; or newer fully supported devices. + * Some Kepler devices supported. + * + * @param pciInfo The PCI address of the GPU to be removed + * @param gpuState Whether the GPU is to be removed from the OS, + * see \ref nvmlDetachGpuState_t + * @param linkState Requested upstream PCIe link state, see \ref nvmlPcieLinkState_t + * + * @return + * - \ref NVML_SUCCESS if the GPU was successfully removed + * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized + * - \ref NVML_ERROR_INVALID_ARGUMENT if \a pciInfo is invalid + * - \ref NVML_ERROR_NOT_SUPPORTED if the device doesn't support this feature + * - \ref NVML_ERROR_IN_USE if the device is still in use and cannot be removed + */ +nvmlReturn_t DECLDIR nvmlDeviceRemoveGpu (nvmlPciInfo_t *pciInfo, nvmlDetachGpuState_t gpuState, nvmlPcieLinkState_t linkState); + +/** + * Request the OS and the NVIDIA kernel driver to rediscover a portion of the PCI subsystem looking for GPUs that + * were previously removed. The portion of the PCI tree can be narrowed by specifying a domain, bus, and device. 
+ * If all are zeroes then the entire PCI tree will be searched. Please note that for long-running NVML processes + * the enumeration will change based on how many GPUs are discovered and where they are inserted in bus order. + * + * In addition, all newly discovered GPUs will be initialized and their ECC scrubbed which may take several seconds + * per GPU. Also, all device handles are no longer guaranteed to be valid post discovery. + * + * Must be run as administrator. + * For Linux only. + * + * For Pascal &tm; or newer fully supported devices. + * Some Kepler devices supported. + * + * @param pciInfo The PCI tree to be searched. Only the domain, bus, and device + * fields are used in this call. + * + * @return + * - \ref NVML_SUCCESS if the rediscovery completed successfully + * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized + * - \ref NVML_ERROR_INVALID_ARGUMENT if \a pciInfo is invalid + * - \ref NVML_ERROR_NOT_SUPPORTED if the operating system does not support this feature + * - \ref NVML_ERROR_OPERATING_SYSTEM if the operating system is denying this feature + * - \ref NVML_ERROR_NO_PERMISSION if the calling process has insufficient permissions to perform operation + * - \ref NVML_ERROR_UNKNOWN on any unexpected error + */ +nvmlReturn_t DECLDIR nvmlDeviceDiscoverGpus (nvmlPciInfo_t *pciInfo); + +/** @} */ + +/***************************************************************************************************/ +/** @defgroup nvmlFieldValueQueries Field Value Queries + * This chapter describes NVML operations that are associated with retrieving Field Values from NVML + * @{ + */ +/***************************************************************************************************/ + +/** + * Request values for a list of fields for a device. This API allows multiple fields to be queried at once. 
+ * If any of the underlying fieldIds are populated by the same driver call, the results for those field IDs + * will be populated from a single call rather than making a driver call for each fieldId. + * + * @param device The device handle of the GPU to request field values for + * @param valuesCount Number of entries in values that should be retrieved + * @param values Array of \a valuesCount structures to hold field values. + * Each value's fieldId must be populated prior to this call + * + * @return + * - \ref NVML_SUCCESS if any values in \a values were populated. Note that you must + * check the nvmlReturn field of each value for each individual + * status + * - \ref NVML_ERROR_INVALID_ARGUMENT if \a device is invalid or \a values is NULL + */ +nvmlReturn_t DECLDIR nvmlDeviceGetFieldValues(nvmlDevice_t device, int valuesCount, nvmlFieldValue_t *values); + + +/** @} */ + +/***************************************************************************************************/ +/** @defgroup nvmlGridQueries Grid Queries + * This chapter describes NVML operations that are associated with NVIDIA GRID products. + * @{ + */ +/***************************************************************************************************/ + +/** + * This method is used to get the virtualization mode corresponding to the GPU. + * + * For Kepler &tm; or newer fully supported devices. + * + * @param device Identifier of the target device + * @param pVirtualMode Reference to virtualization mode. One of NVML_GPU_VIRTUALIZATION_? 
+ * + * @return + * - \ref NVML_SUCCESS if \a pVirtualMode is fetched + * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized + * - \ref NVML_ERROR_INVALID_ARGUMENT if \a device is invalid or \a pVirtualMode is NULL + * - \ref NVML_ERROR_GPU_IS_LOST if the target GPU has fallen off the bus or is otherwise inaccessible + * - \ref NVML_ERROR_UNKNOWN on any unexpected error + */ +nvmlReturn_t DECLDIR nvmlDeviceGetVirtualizationMode(nvmlDevice_t device, nvmlGpuVirtualizationMode_t *pVirtualMode); + +/** @} */ + +/***************************************************************************************************/ +/** @defgroup nvmlGridCommands Grid Commands + * This chapter describes NVML operations that are associated with NVIDIA GRID products. + * @{ + */ +/***************************************************************************************************/ + +/** + * This method is used to set the virtualization mode corresponding to the GPU. + * + * For Kepler &tm; or newer fully supported devices. + * + * @param device Identifier of the target device + * @param virtualMode virtualization mode. One of NVML_GPU_VIRTUALIZATION_? + * + * @return + * - \ref NVML_SUCCESS if \a virtualMode is set + * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized + * - \ref NVML_ERROR_INVALID_ARGUMENT if \a device or \a virtualMode is invalid + * - \ref NVML_ERROR_GPU_IS_LOST if the target GPU has fallen off the bus or is otherwise inaccessible + * - \ref NVML_ERROR_NOT_SUPPORTED if setting of virtualization mode is not supported. + * - \ref NVML_ERROR_NO_PERMISSION if setting of virtualization mode is not allowed for this client. 
+ */ +nvmlReturn_t DECLDIR nvmlDeviceSetVirtualizationMode(nvmlDevice_t device, nvmlGpuVirtualizationMode_t virtualMode); + +/** @} */ + +/***************************************************************************************************/ +/** @defgroup nvmlVgpu vGPU Management + * @{ + * + * Set of APIs supporting GRID vGPU + */ +/***************************************************************************************************/ + +/** + * Retrieve the supported vGPU types on a physical GPU (device). + * + * An array of supported vGPU types for the physical GPU indicated by \a device is returned in the caller-supplied buffer + * pointed at by \a vgpuTypeIds. The element count of nvmlVgpuTypeId_t array is passed in \a vgpuCount, and \a vgpuCount + * is used to return the number of vGPU types written to the buffer. + * + * If the supplied buffer is not large enough to accommodate the vGPU type array, the function returns + * NVML_ERROR_INSUFFICIENT_SIZE, with the element count of nvmlVgpuTypeId_t array required in \a vgpuCount. + * To query the number of vGPU types supported for the GPU, call this function with *vgpuCount = 0. + * The code will return NVML_ERROR_INSUFFICIENT_SIZE, or NVML_SUCCESS if no vGPU types are supported. 
+ * + * @param device The identifier of the target device + * @param vgpuCount Pointer to caller-supplied array size, and returns number of vGPU types + * @param vgpuTypeIds Pointer to caller-supplied array in which to return list of vGPU types + * + * @return + * - \ref NVML_SUCCESS successful completion + * - \ref NVML_ERROR_INSUFFICIENT_SIZE \a vgpuTypeIds buffer is too small, array element count is returned in \a vgpuCount + * - \ref NVML_ERROR_INVALID_ARGUMENT if \a vgpuCount is NULL or \a device is invalid + * - \ref NVML_ERROR_NOT_SUPPORTED if vGPU is not supported by the device + * - \ref NVML_ERROR_VGPU_ECC_NOT_SUPPORTED if ECC is enabled on the device + * - \ref NVML_ERROR_UNKNOWN on any unexpected error + */ +nvmlReturn_t DECLDIR nvmlDeviceGetSupportedVgpus(nvmlDevice_t device, unsigned int *vgpuCount, nvmlVgpuTypeId_t *vgpuTypeIds); + +/** + * Retrieve the currently creatable vGPU types on a physical GPU (device). + * + * An array of creatable vGPU types for the physical GPU indicated by \a device is returned in the caller-supplied buffer + * pointed at by \a vgpuTypeIds. The element count of nvmlVgpuTypeId_t array is passed in \a vgpuCount, and \a vgpuCount + * is used to return the number of vGPU types written to the buffer. + * + * The creatable vGPU types for a device may differ over time, as there may be restrictions on what type of vGPU types + * can concurrently run on a device. For example, if only one vGPU type is allowed at a time on a device, then the creatable + * list will be restricted to whatever vGPU type is already running on the device. + * + * If the supplied buffer is not large enough to accommodate the vGPU type array, the function returns + * NVML_ERROR_INSUFFICIENT_SIZE, with the element count of nvmlVgpuTypeId_t array required in \a vgpuCount. + * To query the number of vGPU types creatable for the GPU, call this function with *vgpuCount = 0. 
+ * The code will return NVML_ERROR_INSUFFICIENT_SIZE, or NVML_SUCCESS if no vGPU types are creatable. + * + * @param device The identifier of the target device + * @param vgpuCount Pointer to caller-supplied array size, and returns number of vGPU types + * @param vgpuTypeIds Pointer to caller-supplied array in which to return list of vGPU types + * + * @return + * - \ref NVML_SUCCESS successful completion + * - \ref NVML_ERROR_INSUFFICIENT_SIZE \a vgpuTypeIds buffer is too small, array element count is returned in \a vgpuCount + * - \ref NVML_ERROR_INVALID_ARGUMENT if \a vgpuCount is NULL + * - \ref NVML_ERROR_NOT_SUPPORTED if vGPU is not supported by the device + * - \ref NVML_ERROR_VGPU_ECC_NOT_SUPPORTED if ECC is enabled on the device + * - \ref NVML_ERROR_UNKNOWN on any unexpected error + */ +nvmlReturn_t DECLDIR nvmlDeviceGetCreatableVgpus(nvmlDevice_t device, unsigned int *vgpuCount, nvmlVgpuTypeId_t *vgpuTypeIds); + +/** + * Retrieve the class of a vGPU type. It will not exceed 64 characters in length (including the NUL terminator). + * See \ref nvmlConstants::NVML_DEVICE_NAME_BUFFER_SIZE. + * + * For Kepler &tm; or newer fully supported devices. + * + * @param vgpuTypeId Handle to vGPU type + * @param vgpuTypeClass Pointer to string array to return class in + * @param size Size of string + * + * @return + * - \ref NVML_SUCCESS successful completion + * - \ref NVML_ERROR_INVALID_ARGUMENT if \a vgpuTypeId is invalid, or \a vgpuTypeClass is NULL + * - \ref NVML_ERROR_INSUFFICIENT_SIZE if \a size is too small + * - \ref NVML_ERROR_UNKNOWN on any unexpected error + */ +nvmlReturn_t DECLDIR nvmlVgpuTypeGetClass(nvmlVgpuTypeId_t vgpuTypeId, char *vgpuTypeClass, unsigned int *size); + +/** + * Retrieve the vGPU type name. + * + * The name is an alphanumeric string that denotes a particular vGPU, e.g. GRID M60-2Q. It will not + * exceed 64 characters in length (including the NUL terminator). See \ref + * nvmlConstants::NVML_DEVICE_NAME_BUFFER_SIZE. 
+ * + * For Kepler &tm; or newer fully supported devices. + * + * @param vgpuTypeId Handle to vGPU type + * @param vgpuTypeName Pointer to buffer to return name + * @param size Size of buffer + * + * @return + * - \ref NVML_SUCCESS successful completion + * - \ref NVML_ERROR_INVALID_ARGUMENT if \a vgpuTypeId is invalid, or \a vgpuTypeName is NULL + * - \ref NVML_ERROR_INSUFFICIENT_SIZE if \a size is too small + * - \ref NVML_ERROR_UNKNOWN on any unexpected error + */ +nvmlReturn_t DECLDIR nvmlVgpuTypeGetName(nvmlVgpuTypeId_t vgpuTypeId, char *vgpuTypeName, unsigned int *size); + +/** + * Retrieve the device ID of a vGPU type. + * + * For Kepler &tm; or newer fully supported devices. + * + * @param vgpuTypeId Handle to vGPU type + * @param deviceID Device ID and vendor ID of the device contained in a single 32 bit value + * @param subsystemID Subsystem ID and subsystem vendor ID of the device contained in a single 32 bit value + * + * @return + * - \ref NVML_SUCCESS successful completion + * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized + * - \ref NVML_ERROR_INVALID_ARGUMENT if \a vgpuTypeId is invalid, or \a deviceID or \a subsystemID are NULL + * - \ref NVML_ERROR_UNKNOWN on any unexpected error + */ +nvmlReturn_t DECLDIR nvmlVgpuTypeGetDeviceID(nvmlVgpuTypeId_t vgpuTypeId, unsigned long long *deviceID, unsigned long long *subsystemID); + +/** + * Retrieve the vGPU framebuffer size in bytes. + * + * For Kepler &tm; or newer fully supported devices. 
+ * + * @param vgpuTypeId Handle to vGPU type + * @param fbSize Pointer to framebuffer size in bytes + * + * @return + * - \ref NVML_SUCCESS successful completion + * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized + * - \ref NVML_ERROR_INVALID_ARGUMENT if \a vgpuTypeId is invalid, or \a fbSize is NULL + * - \ref NVML_ERROR_UNKNOWN on any unexpected error + */ +nvmlReturn_t DECLDIR nvmlVgpuTypeGetFramebufferSize(nvmlVgpuTypeId_t vgpuTypeId, unsigned long long *fbSize); + +/** + * Retrieve count of vGPU's supported display heads. + * + * For Kepler &tm; or newer fully supported devices. + * + * @param vgpuTypeId Handle to vGPU type + * @param numDisplayHeads Pointer to number of display heads + * + * @return + * - \ref NVML_SUCCESS successful completion + * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized + * - \ref NVML_ERROR_INVALID_ARGUMENT if \a vgpuTypeId is invalid, or \a numDisplayHeads is NULL + * - \ref NVML_ERROR_UNKNOWN on any unexpected error + */ +nvmlReturn_t DECLDIR nvmlVgpuTypeGetNumDisplayHeads(nvmlVgpuTypeId_t vgpuTypeId, unsigned int *numDisplayHeads); + +/** + * Retrieve vGPU display head's maximum supported resolution. + * + * For Kepler &tm; or newer fully supported devices. + * + * @param vgpuTypeId Handle to vGPU type + * @param displayIndex Zero-based index of display head + * @param xdim Pointer to maximum number of pixels in X dimension + * @param ydim Pointer to maximum number of pixels in Y dimension + * + * @return + * - \ref NVML_SUCCESS successful completion + * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized + * - \ref NVML_ERROR_INVALID_ARGUMENT if \a vgpuTypeId is invalid, or \a xdim or \a ydim are NULL, or \a displayIndex + * is out of range. 
+ * - \ref NVML_ERROR_UNKNOWN on any unexpected error + */ +nvmlReturn_t DECLDIR nvmlVgpuTypeGetResolution(nvmlVgpuTypeId_t vgpuTypeId, unsigned int displayIndex, unsigned int *xdim, unsigned int *ydim); + +/** + * Retrieve license requirements for a vGPU type + * + * The license type and version required to run the specified vGPU type is returned as an alphanumeric string, in the form + * "\<license name\>,\<version\>", for example "GRID-Virtual-PC,2.0". If a vGPU is runnable with more than one type of license, + * the licenses are delimited by a semicolon, for example "GRID-Virtual-PC,2.0;GRID-Virtual-WS,2.0;GRID-Virtual-WS-Ext,2.0". + * + * The total length of the returned string will not exceed 128 characters, including the NUL terminator. + * See \ref nvmlVgpuConstants::NVML_GRID_LICENSE_BUFFER_SIZE. + * + * For Kepler &tm; or newer fully supported devices. + * + * @param vgpuTypeId Handle to vGPU type + * @param vgpuTypeLicenseString Pointer to buffer to return license info + * @param size Size of \a vgpuTypeLicenseString buffer + * + * @return + * - \ref NVML_SUCCESS successful completion + * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized + * - \ref NVML_ERROR_INVALID_ARGUMENT if \a vgpuTypeId is invalid, or \a vgpuTypeLicenseString is NULL + * - \ref NVML_ERROR_INSUFFICIENT_SIZE if \a size is too small + * - \ref NVML_ERROR_UNKNOWN on any unexpected error + */ +nvmlReturn_t DECLDIR nvmlVgpuTypeGetLicense(nvmlVgpuTypeId_t vgpuTypeId, char *vgpuTypeLicenseString, unsigned int size); + +/** + * Retrieve the static frame rate limit value of the vGPU type + * + * For Kepler &tm; or newer fully supported devices. 
+ * + * @param vgpuTypeId Handle to vGPU type + * @param frameRateLimit Reference to return the frame rate limit value + * @return + * - \ref NVML_SUCCESS successful completion + * - \ref NVML_ERROR_NOT_SUPPORTED if frame rate limiter is turned off for the vGPU type + * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized + * - \ref NVML_ERROR_INVALID_ARGUMENT if \a vgpuTypeId is invalid, or \a frameRateLimit is NULL + * - \ref NVML_ERROR_UNKNOWN on any unexpected error + */ +nvmlReturn_t DECLDIR nvmlVgpuTypeGetFrameRateLimit(nvmlVgpuTypeId_t vgpuTypeId, unsigned int *frameRateLimit); + +/** + * Retrieve the maximum number of vGPU instances creatable on a device for given vGPU type + * + * For Kepler &tm; or newer fully supported devices. + * + * @param device The identifier of the target device + * @param vgpuTypeId Handle to vGPU type + * @param vgpuInstanceCount Pointer to get the max number of vGPU instances + * that can be created on a device for given vgpuTypeId + * @return + * - \ref NVML_SUCCESS successful completion + * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized + * - \ref NVML_ERROR_INVALID_ARGUMENT if \a vgpuTypeId is invalid or is not supported on target device, + * or \a vgpuInstanceCount is NULL + * - \ref NVML_ERROR_UNKNOWN on any unexpected error + */ +nvmlReturn_t DECLDIR nvmlVgpuTypeGetMaxInstances(nvmlDevice_t device, nvmlVgpuTypeId_t vgpuTypeId, unsigned int *vgpuInstanceCount); + +/** + * Retrieve the active vGPU instances on a device. + * + * An array of active vGPU instances is returned in the caller-supplied buffer pointed at by \a vgpuInstances. The + * array element count is passed in \a vgpuCount, and \a vgpuCount is used to return the number of vGPU instances + * written to the buffer.
+ * + * If the supplied buffer is not large enough to accommodate the vGPU instance array, the function returns + * NVML_ERROR_INSUFFICIENT_SIZE, with the element count of nvmlVgpuInstance_t array required in \a vgpuCount. + * To query the number of active vGPU instances, call this function with *vgpuCount = 0. The code will return + * NVML_ERROR_INSUFFICIENT_SIZE, or NVML_SUCCESS if no vGPU Types are supported. + * + * For Kepler &tm; or newer fully supported devices. + * + * @param device The identifier of the target device + * @param vgpuCount Pointer which passes in the array size as well as gets + * back the number of vGPU instances + * @param vgpuInstances Pointer to array in which to return list of vGPU instances + * + * @return + * - \ref NVML_SUCCESS successful completion + * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized + * - \ref NVML_ERROR_INVALID_ARGUMENT if \a device is invalid, or \a vgpuCount is NULL + * - \ref NVML_ERROR_INSUFFICIENT_SIZE if \a vgpuCount is too small + * - \ref NVML_ERROR_NOT_SUPPORTED if vGPU is not supported by the device + * - \ref NVML_ERROR_UNKNOWN on any unexpected error + */ +nvmlReturn_t DECLDIR nvmlDeviceGetActiveVgpus(nvmlDevice_t device, unsigned int *vgpuCount, nvmlVgpuInstance_t *vgpuInstances); + +/** + * Retrieve the VM ID associated with a vGPU instance. + * + * The VM ID is returned as a string, not exceeding 80 characters in length (including the NUL terminator). + * See \ref nvmlConstants::NVML_DEVICE_UUID_BUFFER_SIZE. + * + * The format of the VM ID varies by platform, and is indicated by the type identifier returned in \a vmIdType. + * + * For Kepler &tm; or newer fully supported devices.
+ * + * @param vgpuInstance Identifier of the target vGPU instance + * @param vmId Pointer to caller-supplied buffer to hold VM ID + * @param size Size of buffer in bytes + * @param vmIdType Pointer to hold VM ID type + * + * @return + * - \ref NVML_SUCCESS successful completion + * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized + * - \ref NVML_ERROR_INVALID_ARGUMENT if \a vgpuInstance is invalid, or \a vmId or \a vmIdType are NULL + * - \ref NVML_ERROR_INSUFFICIENT_SIZE if \a size is too small + * - \ref NVML_ERROR_UNKNOWN on any unexpected error + */ +nvmlReturn_t DECLDIR nvmlVgpuInstanceGetVmID(nvmlVgpuInstance_t vgpuInstance, char *vmId, unsigned int size, nvmlVgpuVmIdType_t *vmIdType); + +/** + * Retrieve the UUID of a vGPU instance. + * + * The UUID is a globally unique identifier associated with the vGPU, and is returned as a 5-part hexadecimal string, + * not exceeding 80 characters in length (including the NULL terminator). + * See \ref nvmlConstants::NVML_DEVICE_UUID_BUFFER_SIZE. + * + * For Kepler &tm; or newer fully supported devices. + * + * @param vgpuInstance Identifier of the target vGPU instance + * @param uuid Pointer to caller-supplied buffer to hold vGPU UUID + * @param size Size of buffer in bytes + * + * @return + * - \ref NVML_SUCCESS successful completion + * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized + * - \ref NVML_ERROR_INVALID_ARGUMENT if \a vgpuInstance is invalid, or \a uuid is NULL + * - \ref NVML_ERROR_INSUFFICIENT_SIZE if \a size is too small + * - \ref NVML_ERROR_UNKNOWN on any unexpected error + */ +nvmlReturn_t DECLDIR nvmlVgpuInstanceGetUUID(nvmlVgpuInstance_t vgpuInstance, char *uuid, unsigned int size); + +/** + * Retrieve the NVIDIA driver version installed in the VM associated with a vGPU. + * + * The version is returned as an alphanumeric string in the caller-supplied buffer \a version. 
The length of the version + * string will not exceed 80 characters (including the NUL terminator). + * See \ref nvmlConstants::NVML_SYSTEM_DRIVER_VERSION_BUFFER_SIZE. + * + * nvmlVgpuInstanceGetVmDriverVersion() may be called at any time for a vGPU instance. The guest VM driver version is + * returned as "Unknown" if no NVIDIA driver is installed in the VM, or the VM has not yet booted to the point where the + * NVIDIA driver is loaded and initialized. + * + * For Kepler &tm; or newer fully supported devices. + * + * @param vgpuInstance Identifier of the target vGPU instance + * @param version Caller-supplied buffer to return driver version string + * @param length Size of \a version buffer + * + * @return + * - \ref NVML_SUCCESS if \a version has been set + * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized + * - \ref NVML_ERROR_INVALID_ARGUMENT if \a vgpuInstance is invalid + * - \ref NVML_ERROR_INSUFFICIENT_SIZE if \a length is too small + * - \ref NVML_ERROR_UNKNOWN on any unexpected error + */ +nvmlReturn_t DECLDIR nvmlVgpuInstanceGetVmDriverVersion(nvmlVgpuInstance_t vgpuInstance, char* version, unsigned int length); + +/** + * Retrieve the framebuffer usage in bytes. + * + * Framebuffer usage is the amount of vGPU framebuffer memory that is currently in use by the VM. + * + * For Kepler &tm; or newer fully supported devices.
+ * + * @param vgpuInstance The identifier of the target instance + * @param fbUsage Pointer to framebuffer usage in bytes + * + * @return + * - \ref NVML_SUCCESS successful completion + * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized + * - \ref NVML_ERROR_INVALID_ARGUMENT if \a vgpuInstance is invalid, or \a fbUsage is NULL + * - \ref NVML_ERROR_UNKNOWN on any unexpected error + */ +nvmlReturn_t DECLDIR nvmlVgpuInstanceGetFbUsage(nvmlVgpuInstance_t vgpuInstance, unsigned long long *fbUsage); + +/** + * Retrieve the current licensing state of the vGPU instance. + * + * If the vGPU is currently licensed, \a licensed is set to 1, otherwise it is set to 0. + * + * For Kepler &tm; or newer fully supported devices. + * + * @param vgpuInstance Identifier of the target vGPU instance + * @param licensed Reference to return the licensing status + * + * @return + * - \ref NVML_SUCCESS if \a licensed has been set + * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized + * - \ref NVML_ERROR_INVALID_ARGUMENT if \a vgpuInstance is invalid, or \a licensed is NULL + * - \ref NVML_ERROR_UNKNOWN on any unexpected error + */ +nvmlReturn_t DECLDIR nvmlVgpuInstanceGetLicenseStatus(nvmlVgpuInstance_t vgpuInstance, unsigned int *licensed); + +/** + * Retrieve the vGPU type of a vGPU instance. + * + * Returns the vGPU type ID of vgpu assigned to the vGPU instance. + * + * For Kepler &tm; or newer fully supported devices. 
+ * + * @param vgpuInstance Identifier of the target vGPU instance + * @param vgpuTypeId Reference to return the vgpuTypeId + * + * @return + * - \ref NVML_SUCCESS if \a vgpuTypeId has been set + * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized + * - \ref NVML_ERROR_INVALID_ARGUMENT if \a vgpuInstance is invalid, or \a vgpuTypeId is NULL + * - \ref NVML_ERROR_UNKNOWN on any unexpected error + */ +nvmlReturn_t DECLDIR nvmlVgpuInstanceGetType(nvmlVgpuInstance_t vgpuInstance, nvmlVgpuTypeId_t *vgpuTypeId); + +/** + * Retrieve the frame rate limit set for the vGPU instance. + * + * Returns the value of the frame rate limit set for the vGPU instance + * + * For Kepler &tm; or newer fully supported devices. + * + * @param vgpuInstance Identifier of the target vGPU instance + * @param frameRateLimit Reference to return the frame rate limit + * + * @return + * - \ref NVML_SUCCESS if \a frameRateLimit has been set + * - \ref NVML_ERROR_NOT_SUPPORTED if frame rate limiter is turned off for the vGPU type + * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized + * - \ref NVML_ERROR_INVALID_ARGUMENT if \a vgpuInstance is invalid, or \a frameRateLimit is NULL + * - \ref NVML_ERROR_UNKNOWN on any unexpected error + */ +nvmlReturn_t DECLDIR nvmlVgpuInstanceGetFrameRateLimit(nvmlVgpuInstance_t vgpuInstance, unsigned int *frameRateLimit); + +/** + * Retrieve the encoder capacity of a vGPU instance, as a percentage of maximum encoder capacity with valid values in the range 0-100. + * + * For Maxwell &tm; or newer fully supported devices. 
+ * + * @param vgpuInstance Identifier of the target vGPU instance + * @param encoderCapacity Reference to an unsigned int for the encoder capacity + * + * @return + * - \ref NVML_SUCCESS if \a encoderCapacity has been retrieved + * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized + * - \ref NVML_ERROR_INVALID_ARGUMENT if \a vgpuInstance is invalid, or \a encoderCapacity is NULL + * - \ref NVML_ERROR_UNKNOWN on any unexpected error + */ +nvmlReturn_t DECLDIR nvmlVgpuInstanceGetEncoderCapacity(nvmlVgpuInstance_t vgpuInstance, unsigned int *encoderCapacity); + +/** + * Set the encoder capacity of a vGPU instance, as a percentage of maximum encoder capacity with valid values in the range 0-100. + * + * For Maxwell &tm; or newer fully supported devices. + * + * @param vgpuInstance Identifier of the target vGPU instance + * @param encoderCapacity Unsigned int for the encoder capacity value + * + * @return + * - \ref NVML_SUCCESS if \a encoderCapacity has been set + * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized + * - \ref NVML_ERROR_INVALID_ARGUMENT if \a vgpuInstance is invalid + * - \ref NVML_ERROR_UNKNOWN on any unexpected error + */ +nvmlReturn_t DECLDIR nvmlVgpuInstanceSetEncoderCapacity(nvmlVgpuInstance_t vgpuInstance, unsigned int encoderCapacity); + +/** + * Retrieves current utilization for vGPUs on a physical GPU (device). + * + * For Kepler &tm; or newer fully supported devices. + * + * Reads recent utilization of GPU SM (3D/Compute), framebuffer, video encoder, and video decoder for vGPU instances running + * on a device. Utilization values are returned as an array of utilization sample structures in the caller-supplied buffer + * pointed at by \a utilizationSamples. One utilization sample structure is returned per vGPU instance, and includes the + * CPU timestamp at which the samples were recorded.
Individual utilization values are returned as "unsigned int" values + * in nvmlValue_t unions. The function sets the caller-supplied \a sampleValType to NVML_VALUE_TYPE_UNSIGNED_INT to + * indicate the returned value type. + * + * To read utilization values, first determine the size of buffer required to hold the samples by invoking the function with + * \a utilizationSamples set to NULL. The function will return NVML_ERROR_INSUFFICIENT_SIZE, with the current vGPU instance + * count in \a vgpuInstanceSamplesCount, or NVML_SUCCESS if the current vGPU instance count is zero. The caller should allocate + * a buffer of size vgpuInstanceSamplesCount * sizeof(nvmlVgpuInstanceUtilizationSample_t). Invoke the function again with + * the allocated buffer passed in \a utilizationSamples, and \a vgpuInstanceSamplesCount set to the number of entries the + * buffer is sized for. + * + * On successful return, the function updates \a vgpuInstanceSamplesCount with the number of vGPU utilization sample + * structures that were actually written. This may differ from a previously read value as vGPU instances are created or + * destroyed. + * + * lastSeenTimeStamp represents the CPU timestamp in microseconds at which utilization samples were last read. Set it to 0 + * to read utilization based on all the samples maintained by the driver's internal sample buffer. Set lastSeenTimeStamp + * to a timeStamp retrieved from a previous query to read utilization since the previous query. + * + * @param device The identifier for the target device + * @param lastSeenTimeStamp Return only samples with timestamp greater than lastSeenTimeStamp.
+ * @param sampleValType Pointer to caller-supplied buffer to hold the type of returned sample values + * @param vgpuInstanceSamplesCount Pointer to caller-supplied array size, and returns number of vGPU instances + * @param utilizationSamples Pointer to caller-supplied buffer in which vGPU utilization samples are returned + + * @return + * - \ref NVML_SUCCESS if utilization samples are successfully retrieved + * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized + * - \ref NVML_ERROR_INVALID_ARGUMENT if \a device is invalid, \a vgpuInstanceSamplesCount or \a sampleValType is + * NULL, or a sample count of 0 is passed with a non-NULL \a utilizationSamples + * - \ref NVML_ERROR_INSUFFICIENT_SIZE if supplied \a vgpuInstanceSamplesCount is too small to return samples for all + * vGPU instances currently executing on the device + * - \ref NVML_ERROR_NOT_SUPPORTED if vGPU is not supported by the device + * - \ref NVML_ERROR_GPU_IS_LOST if the target GPU has fallen off the bus or is otherwise inaccessible + * - \ref NVML_ERROR_NOT_FOUND if sample entries are not found + * - \ref NVML_ERROR_UNKNOWN on any unexpected error + */ +nvmlReturn_t DECLDIR nvmlDeviceGetVgpuUtilization(nvmlDevice_t device, unsigned long long lastSeenTimeStamp, + nvmlValueType_t *sampleValType, unsigned int *vgpuInstanceSamplesCount, + nvmlVgpuInstanceUtilizationSample_t *utilizationSamples); + +/** + * Retrieves current utilization for processes running on vGPUs on a physical GPU (device). + * + * For Maxwell &tm; or newer fully supported devices. + * + * Reads recent utilization of GPU SM (3D/Compute), framebuffer, video encoder, and video decoder for processes running on + * vGPU instances active on a device. Utilization values are returned as an array of utilization sample structures in the + * caller-supplied buffer pointed at by \a utilizationSamples. 
One utilization sample structure is returned per process running + * on vGPU instances, that had some non-zero utilization during the last sample period. It includes the CPU timestamp at which + * the samples were recorded. Individual utilization values are returned as "unsigned int" values. + * + * To read utilization values, first determine the size of buffer required to hold the samples by invoking the function with + * \a utilizationSamples set to NULL. The function will return NVML_ERROR_INSUFFICIENT_SIZE, with the current vGPU instance + * count in \a vgpuProcessSamplesCount. The caller should allocate a buffer of size + * vgpuProcessSamplesCount * sizeof(nvmlVgpuProcessUtilizationSample_t). Invoke the function again with + * the allocated buffer passed in \a utilizationSamples, and \a vgpuProcessSamplesCount set to the number of entries the + * buffer is sized for. + * + * On successful return, the function updates \a vgpuProcessSamplesCount with the number of vGPU sub process utilization sample + * structures that were actually written. This may differ from a previously read value depending on the number of processes that are active + * in any given sample period. + * + * lastSeenTimeStamp represents the CPU timestamp in microseconds at which utilization samples were last read. Set it to 0 + * to read utilization based on all the samples maintained by the driver's internal sample buffer. Set lastSeenTimeStamp + * to a timeStamp retrieved from a previous query to read utilization since the previous query. + * + * @param device The identifier for the target device + * @param lastSeenTimeStamp Return only samples with timestamp greater than lastSeenTimeStamp.
+ * @param vgpuProcessSamplesCount Pointer to caller-supplied array size, and returns number of processes running on vGPU instances + * @param utilizationSamples Pointer to caller-supplied buffer in which vGPU sub process utilization samples are returned + + * @return + * - \ref NVML_SUCCESS if utilization samples are successfully retrieved + * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized + * - \ref NVML_ERROR_INVALID_ARGUMENT if \a device is invalid, \a vgpuProcessSamplesCount or a sample count of 0 is + * passed with a non-NULL \a utilizationSamples + * - \ref NVML_ERROR_INSUFFICIENT_SIZE if supplied \a vgpuProcessSamplesCount is too small to return samples for all + * vGPU instances currently executing on the device + * - \ref NVML_ERROR_NOT_SUPPORTED if vGPU is not supported by the device + * - \ref NVML_ERROR_GPU_IS_LOST if the target GPU has fallen off the bus or is otherwise inaccessible + * - \ref NVML_ERROR_NOT_FOUND if sample entries are not found + * - \ref NVML_ERROR_UNKNOWN on any unexpected error + */ +nvmlReturn_t DECLDIR nvmlDeviceGetVgpuProcessUtilization(nvmlDevice_t device, unsigned long long lastSeenTimeStamp, + unsigned int *vgpuProcessSamplesCount, + nvmlVgpuProcessUtilizationSample_t *utilizationSamples); +/** + * Retrieve the GRID licensable features. + * + * Identifies whether the system supports GRID Software Licensing. If it does, return the list of licensable feature(s) + * and their current license status. 
+ * + * @param device Identifier of the target device + * @param pGridLicensableFeatures Pointer to structure in which GRID licensable features are returned + * + * @return + * - \ref NVML_SUCCESS if licensable features are successfully retrieved + * - \ref NVML_ERROR_INVALID_ARGUMENT if \a pGridLicensableFeatures is NULL + * - \ref NVML_ERROR_UNKNOWN on any unexpected error + */ +nvmlReturn_t DECLDIR nvmlDeviceGetGridLicensableFeatures(nvmlDevice_t device, nvmlGridLicensableFeatures_t *pGridLicensableFeatures); + +/** + * Retrieves the current encoder statistics of a vGPU Instance + * + * For Maxwell &tm; or newer fully supported devices. + * + * @param vgpuInstance Identifier of the target vGPU instance + * @param sessionCount Reference to an unsigned int for count of active encoder sessions + * @param averageFps Reference to an unsigned int for trailing average FPS of all active sessions + * @param averageLatency Reference to an unsigned int for encode latency in microseconds + * + * @return + * - \ref NVML_SUCCESS if \a sessionCount, \a averageFps and \a averageLatency are fetched + * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized + * - \ref NVML_ERROR_INVALID_ARGUMENT if \a sessionCount, or \a averageFps or \a averageLatency is NULL + * or \a vgpuInstance is invalid. + * - \ref NVML_ERROR_UNKNOWN on any unexpected error + */ +nvmlReturn_t DECLDIR nvmlVgpuInstanceGetEncoderStats(nvmlVgpuInstance_t vgpuInstance, unsigned int *sessionCount, + unsigned int *averageFps, unsigned int *averageLatency); + +/** + * Retrieves information about all active encoder sessions on a vGPU Instance. + * + * An array of active encoder sessions is returned in the caller-supplied buffer pointed at by \a sessionInfo. The + * array element count is passed in \a sessionCount, and \a sessionCount is used to return the number of sessions + * written to the buffer.
+ * + * If the supplied buffer is not large enough to accommodate the active session array, the function returns + * NVML_ERROR_INSUFFICIENT_SIZE, with the element count of nvmlEncoderSessionInfo_t array required in \a sessionCount. + * To query the number of active encoder sessions, call this function with *sessionCount = 0. The code will return + * NVML_SUCCESS with number of active encoder sessions updated in *sessionCount. + * + * For Maxwell &tm; or newer fully supported devices. + * + * @param vgpuInstance Identifier of the target vGPU instance + * @param sessionCount Reference to caller supplied array size, and returns + * the number of sessions. + * @param sessionInfo Reference to caller supplied array in which the list + * of session information is returned. + * + * @return + * - \ref NVML_SUCCESS if \a sessionInfo is fetched + * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized + * - \ref NVML_ERROR_INSUFFICIENT_SIZE if \a sessionCount is too small, array element count is + returned in \a sessionCount + * - \ref NVML_ERROR_INVALID_ARGUMENT if \a sessionCount is NULL or \a vgpuInstance is invalid. + * - \ref NVML_ERROR_UNKNOWN on any unexpected error + */ +nvmlReturn_t DECLDIR nvmlVgpuInstanceGetEncoderSessions(nvmlVgpuInstance_t vgpuInstance, unsigned int *sessionCount, nvmlEncoderSessionInfo_t *sessionInfo); + +/** + * Retrieves the current utilization and process ID + * + * For Maxwell &tm; or newer fully supported devices. + * + * Reads recent utilization of GPU SM (3D/Compute), framebuffer, video encoder, and video decoder for processes running. + * Utilization values are returned as an array of utilization sample structures in the caller-supplied buffer pointed at + * by \a utilization. One utilization sample structure is returned per process running, that had some non-zero utilization + * during the last sample period. It includes the CPU timestamp at which the samples were recorded.
Individual utilization values + * are returned as "unsigned int" values. + * + * To read utilization values, first determine the size of buffer required to hold the samples by invoking the function with + * \a utilization set to NULL. The caller should allocate a buffer of size + * processSamplesCount * sizeof(nvmlProcessUtilizationSample_t). Invoke the function again with the allocated buffer passed + * in \a utilization, and \a processSamplesCount set to the number of entries the buffer is sized for. + * + * On successful return, the function updates \a processSamplesCount with the number of process utilization sample + * structures that were actually written. This may differ from a previously read value as instances are created or + * destroyed. + * + * lastSeenTimeStamp represents the CPU timestamp in microseconds at which utilization samples were last read. Set it to 0 + * to read utilization based on all the samples maintained by the driver's internal sample buffer. Set lastSeenTimeStamp + * to a timeStamp retrieved from a previous query to read utilization since the previous query. + * + * @param device The identifier of the target device + * @param utilization Pointer to caller-supplied buffer in which guest process utilization samples are returned + * @param processSamplesCount Pointer to caller-supplied array size, and returns number of processes running + * @param lastSeenTimeStamp Return only samples with timestamp greater than lastSeenTimeStamp. 
+ + * @return + * - \ref NVML_SUCCESS if \a utilization has been populated + * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized + * - \ref NVML_ERROR_INVALID_ARGUMENT if \a device is invalid, \a utilization is NULL, or \a samplingPeriodUs is NULL + * - \ref NVML_ERROR_NOT_SUPPORTED if the device does not support this feature + * - \ref NVML_ERROR_GPU_IS_LOST if the target GPU has fallen off the bus or is otherwise inaccessible + * - \ref NVML_ERROR_UNKNOWN on any unexpected error + */ +nvmlReturn_t DECLDIR nvmlDeviceGetProcessUtilization(nvmlDevice_t device, nvmlProcessUtilizationSample_t *utilization, + unsigned int *processSamplesCount, unsigned long long lastSeenTimeStamp); + +/** @} */ + +/***************************************************************************************************/ +/** @defgroup nvml vGPU Migration + * This chapter describes NVML operations that are associated with vGPU Migration. + * @{ + */ +/***************************************************************************************************/ + +/** + * vGPU metadata structure. 
+ */ +typedef struct nvmlVgpuMetadata_st +{ + unsigned int version; //!< Current version of the structure + unsigned int revision; //!< Current revision of the structure + nvmlVgpuGuestInfoState_t guestInfoState; //!< Current state of Guest-dependent fields + char guestDriverVersion[NVML_SYSTEM_DRIVER_VERSION_BUFFER_SIZE]; //!< Version of driver installed in guest + char hostDriverVersion[NVML_SYSTEM_DRIVER_VERSION_BUFFER_SIZE]; //!< Version of driver installed in host + unsigned int reserved[8]; //!< Reserved for internal use + unsigned int opaqueDataSize; //!< Size of opaque data field in bytes + char opaqueData[4]; //!< Opaque data +} nvmlVgpuMetadata_t; + +/** + * Physical GPU metadata structure + */ +typedef struct nvmlVgpuPgpuMetadata_st +{ + unsigned int version; //!< Current version of the structure + unsigned int revision; //!< Current revision of the structure + char hostDriverVersion[NVML_SYSTEM_DRIVER_VERSION_BUFFER_SIZE]; //!< Host driver version + unsigned int pgpuVirtualizationCaps; //!< Pgpu virtualization capabilities bitfield + unsigned int reserved[7]; //!< Reserved for internal use + unsigned int opaqueDataSize; //!< Size of opaque data field in bytes + char opaqueData[4]; //!< Opaque data +} nvmlVgpuPgpuMetadata_t; + +/** + * vGPU VM compatibility codes + */ +typedef enum nvmlVgpuVmCompatibility_enum +{ + NVML_VGPU_VM_COMPATIBILITY_NONE = 0x0, //!< vGPU is not runnable + NVML_VGPU_VM_COMPATIBILITY_COLD = 0x1, //!< vGPU is runnable from a cold / powered-off state (ACPI S5) + NVML_VGPU_VM_COMPATIBILITY_HIBERNATE = 0x2, //!< vGPU is runnable from a hibernated state (ACPI S4) + NVML_VGPU_VM_COMPATIBILITY_SLEEP = 0x4, //!< vGPU is runnable from a sleep state (ACPI S3) + NVML_VGPU_VM_COMPATIBILITY_LIVE = 0x8, //!< vGPU is runnable from a live/paused state (ACPI S0) +} nvmlVgpuVmCompatibility_t; + +/** + * vGPU-pGPU compatibility limit codes + */ +typedef enum nvmlVgpuPgpuCompatibilityLimitCode_enum +{ + NVML_VGPU_COMPATIBILITY_LIMIT_NONE = 0x0, //!<
Compatibility is not limited. + NVML_VGPU_COMPATIBILITY_LIMIT_HOST_DRIVER = 0x1, //!< Compatibility is limited by host driver version. + NVML_VGPU_COMPATIBILITY_LIMIT_GUEST_DRIVER = 0x2, //!< Compatibility is limited by guest driver version. + NVML_VGPU_COMPATIBILITY_LIMIT_GPU = 0x4, //!< Compatibility is limited by GPU hardware. + NVML_VGPU_COMPATIBILITY_LIMIT_OTHER = 0x80000000, //!< Compatibility is limited by an undefined factor. +} nvmlVgpuPgpuCompatibilityLimitCode_t; + +/** + * vGPU-pGPU compatibility structure + */ +typedef struct nvmlVgpuPgpuCompatibility_st +{ + nvmlVgpuVmCompatibility_t vgpuVmCompatibility; //!< Compatibility of vGPU VM. See \ref nvmlVgpuVmCompatibility_t + nvmlVgpuPgpuCompatibilityLimitCode_t compatibilityLimitCode; //!< Limiting factor for vGPU-pGPU compatibility. See \ref nvmlVgpuPgpuCompatibilityLimitCode_t +} nvmlVgpuPgpuCompatibility_t; + +/** + * Returns vGPU metadata structure for a running vGPU. The structure contains information about the vGPU and its associated VM + * such as the currently installed NVIDIA guest driver version, together with host driver version and an opaque data section + * containing internal state. + * + * nvmlVgpuInstanceGetMetadata() may be called at any time for a vGPU instance. Some fields in the returned structure are + * dependent on information obtained from the guest VM, which may not yet have reached a state where that information + * is available. The current state of these dependent fields is reflected in the info structure's \ref guestInfoState field. + * + * The VMM may choose to read and save the vGPU's VM info as persistent metadata associated with the VM, and provide + * it to GRID Virtual GPU Manager when creating a vGPU for subsequent instances of the VM. + * + * The caller passes in a buffer via \a vgpuMetadata, with the size of the buffer in \a bufferSize. 
If the vGPU Metadata structure + * is too large to fit in the supplied buffer, the function returns NVML_ERROR_INSUFFICIENT_SIZE with the size needed + * in \a bufferSize. + * + * @param vgpuInstance vGPU instance handle + * @param vgpuMetadata Pointer to caller-supplied buffer into which vGPU metadata is written + * @param bufferSize Size of vgpuMetadata buffer + * + * @return + * - \ref NVML_SUCCESS vGPU metadata structure was successfully returned + * - \ref NVML_ERROR_INSUFFICIENT_SIZE vgpuMetadata buffer is too small, required size is returned in \a bufferSize + * - \ref NVML_ERROR_INVALID_ARGUMENT if \a bufferSize is NULL or \a vgpuInstance is invalid; if \a vgpuMetadata is NULL and the value of \a bufferSize is not 0. + * - \ref NVML_ERROR_UNKNOWN on any unexpected error + */ +nvmlReturn_t DECLDIR nvmlVgpuInstanceGetMetadata(nvmlVgpuInstance_t vgpuInstance, nvmlVgpuMetadata_t *vgpuMetadata, unsigned int *bufferSize); + +/** + * Returns a vGPU metadata structure for the physical GPU indicated by \a device. The structure contains information about + * the GPU and the currently installed NVIDIA host driver version that's controlling it, together with an opaque data section + * containing internal state. + * + * The caller passes in a buffer via \a pgpuMetadata, with the size of the buffer in \a bufferSize. If the \a pgpuMetadata + * structure is too large to fit in the supplied buffer, the function returns NVML_ERROR_INSUFFICIENT_SIZE with the size needed + * in \a bufferSize. 
+ * + * @param device The identifier of the target device + * @param pgpuMetadata Pointer to caller-supplied buffer into which \a pgpuMetadata is written + * @param bufferSize Pointer to size of \a pgpuMetadata buffer + * + * @return + * - \ref NVML_SUCCESS GPU metadata structure was successfully returned + * - \ref NVML_ERROR_INSUFFICIENT_SIZE pgpuMetadata buffer is too small, required size is returned in \a bufferSize + * - \ref NVML_ERROR_INVALID_ARGUMENT if \a bufferSize is NULL or \a device is invalid; if \a pgpuMetadata is NULL and the value of \a bufferSize is not 0. + * - \ref NVML_ERROR_NOT_SUPPORTED vGPU is not supported by the system + * - \ref NVML_ERROR_UNKNOWN on any unexpected error + */ +nvmlReturn_t DECLDIR nvmlDeviceGetVgpuMetadata(nvmlDevice_t device, nvmlVgpuPgpuMetadata_t *pgpuMetadata, unsigned int *bufferSize); + +/** + * Takes a vGPU instance metadata structure read from \ref nvmlVgpuInstanceGetMetadata(), and a vGPU metadata structure for a + * physical GPU read from \ref nvmlDeviceGetVgpuMetadata(), and returns compatibility information of the vGPU instance and the + * physical GPU. + * + * The caller passes in a buffer via \a compatibilityInfo, into which a compatibility information structure is written. The + * structure defines the states in which the vGPU / VM may be booted on the physical GPU. If the vGPU / VM compatibility + * with the physical GPU is limited, a limit code indicates the factor limiting compatibility. + * (see \ref nvmlVgpuPgpuCompatibilityLimitCode_t for details). + * + * Note: vGPU compatibility does not take into account dynamic capacity conditions that may limit a system's ability to + * boot a given vGPU or associated VM.
+ * + * @param vgpuMetadata Pointer to caller-supplied vGPU metadata structure + * @param pgpuMetadata Pointer to caller-supplied GPU metadata structure + * @param compatibilityInfo Pointer to caller-supplied buffer to hold compatibility info + * + * @return + * - \ref NVML_SUCCESS compatibility information was successfully returned + * - \ref NVML_ERROR_INVALID_ARGUMENT if \a vgpuMetadata or \a pgpuMetadata or \a compatibilityInfo is NULL + * - \ref NVML_ERROR_UNKNOWN on any unexpected error + */ +nvmlReturn_t DECLDIR nvmlGetVgpuCompatibility(nvmlVgpuMetadata_t *vgpuMetadata, nvmlVgpuPgpuMetadata_t *pgpuMetadata, nvmlVgpuPgpuCompatibility_t *compatibilityInfo); + +/** @} */ + +/** + * NVML API versioning support + */ +#if defined(__NVML_API_VERSION_INTERNAL) +#undef nvmlDeviceRemoveGpu +#undef nvmlDeviceGetNvLinkRemotePciInfo +#undef nvmlDeviceGetPciInfo +#undef nvmlDeviceGetCount +#undef nvmlDeviceGetHandleByIndex +#undef nvmlDeviceGetHandleByPciBusId +#undef nvmlInit +#endif + +#ifdef __cplusplus +} +#endif + +#endif diff --git a/vendor/github.com/NVIDIA/gpu-monitoring-tools/bindings/go/nvml/nvml_dl.c b/vendor/github.com/NVIDIA/gpu-monitoring-tools/bindings/go/nvml/nvml_dl.c new file mode 100644 index 000000000..a3d162c0e --- /dev/null +++ b/vendor/github.com/NVIDIA/gpu-monitoring-tools/bindings/go/nvml/nvml_dl.c @@ -0,0 +1,46 @@ +// Copyright (c) 2015-2018, NVIDIA CORPORATION. All rights reserved.
+ /* NOTE(review): the next two #include directives carry no header name; the <...> operands were presumably lost when this text was extracted. The code below uses NULL and dlopen/dlsym/dlclose/dlerror, so they are likely <stddef.h> and <dlfcn.h> - confirm against upstream. */ +#include +#include + +#include "nvml_dl.h" + /* DLSYM(x, sym): clears any pending dlerror() state, looks up the symbol named sym in handle with dlsym(), stores the result in x, and makes the ENCLOSING function return NVML_ERROR_FUNCTION_NOT_FOUND when dlerror() then reports a failure. */ +#define DLSYM(x, sym) \ +do { \ + dlerror(); \ + x = dlsym(handle, #sym); \ + if (dlerror() != NULL) { \ + return (NVML_ERROR_FUNCTION_NOT_FOUND); \ + } \ +} while (0) + /* Function-pointer type returning nvmlReturn_t, declared with an empty parameter list; used to call entry points resolved through DLSYM. */ +typedef nvmlReturn_t (*nvmlSym_t)(); + /* File-scope handle assigned from dlopen() in NVML_DL(nvmlInit) and read by DLSYM and NVML_DL(nvmlShutdown). */ +static void *handle; + /* NVML_DL(nvmlInit): opens libnvidia-ml.so.1 with RTLD_LAZY | RTLD_GLOBAL, overwriting handle on every call. Returns NVML_ERROR_LIBRARY_NOT_FOUND when dlopen() yields NULL, otherwise the result of nvmlInit(). */ +nvmlReturn_t NVML_DL(nvmlInit)(void) +{ + handle = dlopen("libnvidia-ml.so.1", RTLD_LAZY | RTLD_GLOBAL); + if (handle == NULL) { + return (NVML_ERROR_LIBRARY_NOT_FOUND); + } + return (nvmlInit()); +} + /* NVML_DL(nvmlShutdown): calls nvmlShutdown() first and returns its error unchanged, without closing the library, when it is not NVML_SUCCESS. Otherwise calls dlclose(handle) and maps a non-zero result to NVML_ERROR_UNKNOWN. NOTE(review): handle is left unchanged after dlclose(), so a later DLSYM would pass the stale value to dlsym() - confirm callers never use this API after shutdown. */ +nvmlReturn_t NVML_DL(nvmlShutdown)(void) +{ + nvmlReturn_t r = nvmlShutdown(); + if (r != NVML_SUCCESS) { + return (r); + } + return (dlclose(handle) ? NVML_ERROR_UNKNOWN : NVML_SUCCESS); +} + /* NVML_DL(nvmlDeviceGetTopologyCommonAncestor): resolves nvmlDeviceGetTopologyCommonAncestor through DLSYM on each call and forwards dev1, dev2 and info to it, returning its result. Returns NVML_ERROR_FUNCTION_NOT_FOUND (from DLSYM) when the lookup fails. */ +nvmlReturn_t NVML_DL(nvmlDeviceGetTopologyCommonAncestor)( + nvmlDevice_t dev1, nvmlDevice_t dev2, nvmlGpuTopologyLevel_t *info) +{ + nvmlSym_t sym; + + DLSYM(sym, nvmlDeviceGetTopologyCommonAncestor); + return ((*sym)(dev1, dev2, info)); +} diff --git a/vendor/github.com/NVIDIA/gpu-monitoring-tools/bindings/go/nvml/nvml_dl.h b/vendor/github.com/NVIDIA/gpu-monitoring-tools/bindings/go/nvml/nvml_dl.h new file mode 100644 index 000000000..628f0b3a2 --- /dev/null +++ b/vendor/github.com/NVIDIA/gpu-monitoring-tools/bindings/go/nvml/nvml_dl.h @@ -0,0 +1,15 @@ +// Copyright (c) 2015-2018, NVIDIA CORPORATION. All rights reserved.
+ +#ifndef _NVML_DL_H_ +#define _NVML_DL_H_ + +#include "nvml.h" + +#define NVML_DL(x) x##_dl + +extern nvmlReturn_t NVML_DL(nvmlInit)(void); +extern nvmlReturn_t NVML_DL(nvmlShutdown)(void); +extern nvmlReturn_t NVML_DL(nvmlDeviceGetTopologyCommonAncestor)( + nvmlDevice_t, nvmlDevice_t, nvmlGpuTopologyLevel_t *); + +#endif // _NVML_DL_H_ diff --git a/vendor/vendor.json b/vendor/vendor.json index 941f55cfa..779cb773f 100644 --- a/vendor/vendor.json +++ b/vendor/vendor.json @@ -9,6 +9,8 @@ {"path":"github.com/Azure/go-ansiterm/winterm","checksumSHA1":"jBimnggjIiFUjaImNoJhSVLtdzw=","revision":"fa152c58bc15761d0200cb75fe958b89a9d4888e","revisionTime":"2016-06-22T17:32:16Z"}, {"path":"github.com/DataDog/datadog-go/statsd","checksumSHA1":"WvApwvvSe3i/3KO8300dyeFmkbI=","revision":"b10af4b12965a1ad08d164f57d14195b4140d8de","revisionTime":"2017-08-09T10:47:06Z"}, {"path":"github.com/Microsoft/go-winio","checksumSHA1":"AzjRkOQtVBTwIw4RJLTygFhJs3s=","revision":"f533f7a102197536779ea3a8cb881d639e21ec5a","revisionTime":"2017-05-24T00:36:31Z"}, + {"path":"github.com/NVIDIA/gpu-monitoring-tools","checksumSHA1":"kF1vk+8Xvb3nGBiw9+qbUc0SZ4M=","revision":"86f2a9fac6c5b597dc494420005144b8ef7ec9fb","revisionTime":"2018-08-29T22:20:09Z"}, + {"path":"github.com/NVIDIA/gpu-monitoring-tools/bindings/go/nvml","checksumSHA1":"P8FATSSgpe5A17FyPrGpsX95Xw8=","revision":"86f2a9fac6c5b597dc494420005144b8ef7ec9fb","revisionTime":"2018-08-29T22:20:09Z"}, {"path":"github.com/NYTimes/gziphandler","checksumSHA1":"jktW57+vJsziNVPeXMCoujTzdW4=","revision":"97ae7fbaf81620fe97840685304a78a306a39c64","revisionTime":"2017-09-16T00:36:49Z"}, {"path":"github.com/Nvveen/Gotty","checksumSHA1":"Aqy8/FoAIidY/DeQ5oTYSZ4YFVc=","revision":"cd527374f1e5bff4938207604a14f2e38a9cf512","revisionTime":"2012-06-04T00:48:16Z"}, {"path":"github.com/RackSec/srslog","checksumSHA1":"OTN4c1F0p+mEG2CpkU1Kuavupf0=","revision":"259aed10dfa74ea2961eddd1d9847619f6e98837","revisionTime":"2016-01-20T22:33:50Z"},