mirror of
https://github.com/kemko/nomad.git
synced 2026-01-08 19:35:41 +03:00
Merge pull request #4638 from oleksii-shyman/nvidia-plugin
WIP :: Nvidia Plugin
This commit is contained in:
@@ -57,11 +57,16 @@ func Int64ToPtr(i int64) *int64 {
|
||||
return &i
|
||||
}
|
||||
|
||||
// Uint64ToPtr returns a pointer to a copy of the given uint64 value. Every
// call yields a distinct pointer, so the result can be stored safely in
// optional (pointer-typed) fields.
func Uint64ToPtr(u uint64) *uint64 {
	return &u
}
|
||||
|
||||
// UintToPtr allocates a fresh uint holding the supplied value and returns its
// address.
func UintToPtr(u uint) *uint {
	ptr := new(uint)
	*ptr = u
	return ptr
}
|
||||
|
||||
// StringToPtr returns the pointer to a string
|
||||
func StringToPtr(str string) *string {
|
||||
return &str
|
||||
|
||||
23
plugins/device/cmd/nvidia/README.md
Normal file
23
plugins/device/cmd/nvidia/README.md
Normal file
@@ -0,0 +1,23 @@
|
||||
This package provides an implementation of the Nvidia device plugin.
|
||||
|
||||
# Behavior
|
||||
|
||||
The Nvidia device plugin uses NVML bindings to collect data about the available Nvidia devices and exposes them via the Fingerprint RPC. GPUs can be excluded from fingerprinting by setting the `ignored_gpu_ids` field. The plugin sends statistics for the fingerprinted devices every `stats_period`.
|
||||
|
||||
# Config
|
||||
|
||||
The configuration should be passed via an HCL file that begins with a top level `config` stanza:
|
||||
|
||||
```
|
||||
config {
|
||||
ignored_gpu_ids = ["uuid1", "uuid2"]
|
||||
fingerprint_period = "5s"
|
||||
stats_period = "5s"
|
||||
}
|
||||
```
|
||||
|
||||
The valid configuration options are:
|
||||
|
||||
* `ignored_gpu_ids` (`list(string)`: `[]`): List of GPU UUID strings that should not be exposed to Nomad.
|
||||
* `fingerprint_period` (`string`: `"5s"`): The interval at which the fingerprint process is repeated to detect device changes.
|
||||
* `stats_period` (`string`: `"5s"`): The interval at which to emit statistics about the devices.
|
||||
18
plugins/device/cmd/nvidia/cmd/main.go
Normal file
18
plugins/device/cmd/nvidia/cmd/main.go
Normal file
@@ -0,0 +1,18 @@
|
||||
package main
|
||||
|
||||
import (
|
||||
log "github.com/hashicorp/go-hclog"
|
||||
|
||||
"github.com/hashicorp/nomad/plugins"
|
||||
"github.com/hashicorp/nomad/plugins/device/cmd/nvidia"
|
||||
)
|
||||
|
||||
func main() {
|
||||
// Serve the plugin
|
||||
plugins.Serve(factory)
|
||||
}
|
||||
|
||||
// factory returns a new instance of the Nvidia GPU plugin
|
||||
func factory(log log.Logger) interface{} {
|
||||
return nvidia.NewNvidiaDevice(log)
|
||||
}
|
||||
209
plugins/device/cmd/nvidia/device.go
Normal file
209
plugins/device/cmd/nvidia/device.go
Normal file
@@ -0,0 +1,209 @@
|
||||
package nvidia
|
||||
|
||||
import (
|
||||
"context"
|
||||
"fmt"
|
||||
"strings"
|
||||
"sync"
|
||||
"time"
|
||||
|
||||
log "github.com/hashicorp/go-hclog"
|
||||
|
||||
"github.com/hashicorp/nomad/plugins/base"
|
||||
"github.com/hashicorp/nomad/plugins/device"
|
||||
"github.com/hashicorp/nomad/plugins/device/cmd/nvidia/nvml"
|
||||
"github.com/hashicorp/nomad/plugins/shared/hclspec"
|
||||
)
|
||||
|
||||
const (
|
||||
// pluginName is the name of the plugin
|
||||
pluginName = "nvidia-gpu"
|
||||
|
||||
// vendor is the vendor providing the devices
|
||||
vendor = "nvidia"
|
||||
|
||||
// deviceType is the type of device being returned
|
||||
deviceType = device.DeviceTypeGPU
|
||||
|
||||
// notAvailable value is returned to nomad server in case some properties were
|
||||
// undetected by nvml driver
|
||||
notAvailable = "N/A"
|
||||
)
|
||||
|
||||
const (
|
||||
// Nvidia-container-runtime environment variable names
|
||||
nvidiaVisibleDevices = "NVIDIA_VISIBLE_DEVICES"
|
||||
)
|
||||
|
||||
var (
|
||||
// pluginInfo describes the plugin
|
||||
pluginInfo = &base.PluginInfoResponse{
|
||||
Type: base.PluginTypeDevice,
|
||||
PluginApiVersion: "0.0.1", // XXX This should be an array and should be consts
|
||||
PluginVersion: "0.1.0",
|
||||
Name: pluginName,
|
||||
}
|
||||
|
||||
// configSpec is the specification of the plugin's configuration
|
||||
configSpec = hclspec.NewObject(map[string]*hclspec.Spec{
|
||||
"ignored_gpu_ids": hclspec.NewDefault(
|
||||
hclspec.NewAttr("ignored_gpu_ids", "list(string)", false),
|
||||
hclspec.NewLiteral("[]"),
|
||||
),
|
||||
"fingerprint_period": hclspec.NewDefault(
|
||||
hclspec.NewAttr("fingerprint_period", "string", false),
|
||||
hclspec.NewLiteral("\"5s\""),
|
||||
),
|
||||
"stats_period": hclspec.NewDefault(
|
||||
hclspec.NewAttr("stats_period", "string", false),
|
||||
hclspec.NewLiteral("\"5s\""),
|
||||
),
|
||||
})
|
||||
)
|
||||
|
||||
// Config is the decoded form of the plugin configuration.
type Config struct {
	// IgnoredGPUIDs lists UUIDs of GPUs that must not be exposed to nomad
	IgnoredGPUIDs []string `codec:"ignored_gpu_ids"`

	// FingerprintPeriod is the fingerprinting interval as a duration string
	FingerprintPeriod string `codec:"fingerprint_period"`

	// StatsPeriod is the statistics interval as a duration string
	StatsPeriod string `codec:"stats_period"`
}
|
||||
|
||||
// NvidiaDevice contains all plugin specific data
|
||||
type NvidiaDevice struct {
|
||||
// nvmlClient is used to get data from nvidia
|
||||
nvmlClient nvml.NvmlClient
|
||||
|
||||
// nvmlClientInitializationError holds an error retrieved during
|
||||
// nvmlClient initialization
|
||||
nvmlClientInitializationError error
|
||||
|
||||
// ignoredGPUIDs is a set of UUIDs that would not be exposed to nomad
|
||||
ignoredGPUIDs map[string]struct{}
|
||||
|
||||
// fingerprintPeriod is how often we should call nvml to get list of devices
|
||||
fingerprintPeriod time.Duration
|
||||
|
||||
// statsPeriod is how often we should collect statistics for fingerprinted
|
||||
// devices.
|
||||
statsPeriod time.Duration
|
||||
|
||||
// devices is the set of detected eligible devices
|
||||
devices map[string]struct{}
|
||||
deviceLock sync.RWMutex
|
||||
|
||||
logger log.Logger
|
||||
}
|
||||
|
||||
// NewNvidiaDevice returns a new nvidia device plugin.
|
||||
func NewNvidiaDevice(log log.Logger) *NvidiaDevice {
|
||||
nvmlClient, nvmlClientInitializationError := nvml.NewNvmlClient()
|
||||
logger := log.Named(pluginName)
|
||||
if nvmlClientInitializationError != nil {
|
||||
logger.Error("unable to initialize Nvidia driver", "error", nvmlClientInitializationError)
|
||||
}
|
||||
return &NvidiaDevice{
|
||||
logger: logger,
|
||||
devices: make(map[string]struct{}),
|
||||
ignoredGPUIDs: make(map[string]struct{}),
|
||||
nvmlClient: nvmlClient,
|
||||
nvmlClientInitializationError: nvmlClientInitializationError,
|
||||
}
|
||||
}
|
||||
|
||||
// PluginInfo returns information describing the plugin.
|
||||
func (d *NvidiaDevice) PluginInfo() (*base.PluginInfoResponse, error) {
|
||||
return pluginInfo, nil
|
||||
}
|
||||
|
||||
// ConfigSchema returns the plugins configuration schema.
|
||||
func (d *NvidiaDevice) ConfigSchema() (*hclspec.Spec, error) {
|
||||
return configSpec, nil
|
||||
}
|
||||
|
||||
// SetConfig is used to set the configuration of the plugin.
|
||||
func (d *NvidiaDevice) SetConfig(data []byte) error {
|
||||
var config Config
|
||||
if err := base.MsgPackDecode(data, &config); err != nil {
|
||||
return err
|
||||
}
|
||||
|
||||
for _, ignoredGPUId := range config.IgnoredGPUIDs {
|
||||
d.ignoredGPUIDs[ignoredGPUId] = struct{}{}
|
||||
}
|
||||
|
||||
period, err := time.ParseDuration(config.FingerprintPeriod)
|
||||
if err != nil {
|
||||
return fmt.Errorf("failed to parse fingerprint period %q: %v", config.FingerprintPeriod, err)
|
||||
}
|
||||
d.fingerprintPeriod = period
|
||||
|
||||
// Convert the stats period
|
||||
speriod, err := time.ParseDuration(config.StatsPeriod)
|
||||
if err != nil {
|
||||
return fmt.Errorf("failed to parse stats period %q: %v", config.StatsPeriod, err)
|
||||
}
|
||||
d.statsPeriod = speriod
|
||||
|
||||
return nil
|
||||
}
|
||||
|
||||
// Fingerprint streams detected devices. If device changes are detected or the
|
||||
// devices health changes, messages will be emitted.
|
||||
func (d *NvidiaDevice) Fingerprint(ctx context.Context) (<-chan *device.FingerprintResponse, error) {
|
||||
outCh := make(chan *device.FingerprintResponse)
|
||||
go d.fingerprint(ctx, outCh)
|
||||
return outCh, nil
|
||||
}
|
||||
|
||||
type reservationError struct {
|
||||
notExistingIDs []string
|
||||
}
|
||||
|
||||
func (e *reservationError) Error() string {
|
||||
return fmt.Sprintf("unknown device IDs: %s", strings.Join(e.notExistingIDs, ","))
|
||||
}
|
||||
|
||||
// Reserve returns information on how to mount given devices.
|
||||
// Assumption is made that nomad server is responsible for correctness of
|
||||
// GPU allocations, handling tricky cases such as double-allocation of single GPU
|
||||
func (d *NvidiaDevice) Reserve(deviceIDs []string) (*device.ContainerReservation, error) {
|
||||
if len(deviceIDs) == 0 {
|
||||
return &device.ContainerReservation{}, nil
|
||||
}
|
||||
// Due to the asynchronous nature of NvidiaPlugin, there is a possibility
|
||||
// of race condition
|
||||
//
|
||||
// Timeline:
|
||||
// 1 - fingerprint reports that GPU with id "1" is present
|
||||
// 2 - the following events happen at the same time:
|
||||
// a) server decides to allocate GPU with id "1"
|
||||
// b) fingerprint check reports that GPU with id "1" is no more present
|
||||
//
|
||||
// The latest and always valid version of fingerprinted ids are stored in
|
||||
// d.devices map. To avoid this race condition an error is returned if
|
||||
// any of provided deviceIDs is not found in d.devices map
|
||||
d.deviceLock.RLock()
|
||||
var notExistingIDs []string
|
||||
for _, id := range deviceIDs {
|
||||
if _, deviceIDExists := d.devices[id]; !deviceIDExists {
|
||||
notExistingIDs = append(notExistingIDs, id)
|
||||
}
|
||||
}
|
||||
d.deviceLock.RUnlock()
|
||||
if len(notExistingIDs) != 0 {
|
||||
return nil, &reservationError{notExistingIDs}
|
||||
}
|
||||
|
||||
return &device.ContainerReservation{
|
||||
Envs: map[string]string{
|
||||
nvidiaVisibleDevices: strings.Join(deviceIDs, ","),
|
||||
},
|
||||
}, nil
|
||||
}
|
||||
|
||||
// Stats streams statistics for the detected devices.
|
||||
func (d *NvidiaDevice) Stats(ctx context.Context) (<-chan *device.StatsResponse, error) {
|
||||
outCh := make(chan *device.StatsResponse)
|
||||
go d.stats(ctx, outCh)
|
||||
return outCh, nil
|
||||
}
|
||||
115
plugins/device/cmd/nvidia/device_test.go
Normal file
115
plugins/device/cmd/nvidia/device_test.go
Normal file
@@ -0,0 +1,115 @@
|
||||
package nvidia
|
||||
|
||||
import (
|
||||
"testing"
|
||||
|
||||
"github.com/hashicorp/nomad/plugins/device/cmd/nvidia/nvml"
|
||||
|
||||
hclog "github.com/hashicorp/go-hclog"
|
||||
"github.com/hashicorp/nomad/plugins/device"
|
||||
"github.com/stretchr/testify/require"
|
||||
)
|
||||
|
||||
type MockNvmlClient struct {
|
||||
FingerprintError error
|
||||
FingerprintResponseReturned *nvml.FingerprintData
|
||||
|
||||
StatsError error
|
||||
StatsResponseReturned []*nvml.StatsData
|
||||
}
|
||||
|
||||
func (c *MockNvmlClient) GetFingerprintData() (*nvml.FingerprintData, error) {
|
||||
return c.FingerprintResponseReturned, c.FingerprintError
|
||||
}
|
||||
|
||||
func (c *MockNvmlClient) GetStatsData() ([]*nvml.StatsData, error) {
|
||||
return c.StatsResponseReturned, c.StatsError
|
||||
}
|
||||
|
||||
func TestReserve(t *testing.T) {
|
||||
for _, testCase := range []struct {
|
||||
Name string
|
||||
ExpectedReservation *device.ContainerReservation
|
||||
ExpectedError error
|
||||
Device *NvidiaDevice
|
||||
RequestedIDs []string
|
||||
}{
|
||||
{
|
||||
Name: "All RequestedIDs are not managed by Device",
|
||||
ExpectedReservation: nil,
|
||||
ExpectedError: &reservationError{[]string{
|
||||
"UUID1",
|
||||
"UUID2",
|
||||
"UUID3",
|
||||
}},
|
||||
RequestedIDs: []string{
|
||||
"UUID1",
|
||||
"UUID2",
|
||||
"UUID3",
|
||||
},
|
||||
Device: &NvidiaDevice{
|
||||
logger: hclog.NewNullLogger(),
|
||||
},
|
||||
},
|
||||
{
|
||||
Name: "Some RequestedIDs are not managed by Device",
|
||||
ExpectedReservation: nil,
|
||||
ExpectedError: &reservationError{[]string{
|
||||
"UUID1",
|
||||
"UUID2",
|
||||
}},
|
||||
RequestedIDs: []string{
|
||||
"UUID1",
|
||||
"UUID2",
|
||||
"UUID3",
|
||||
},
|
||||
Device: &NvidiaDevice{
|
||||
devices: map[string]struct{}{
|
||||
"UUID3": {},
|
||||
},
|
||||
logger: hclog.NewNullLogger(),
|
||||
},
|
||||
},
|
||||
{
|
||||
Name: "All RequestedIDs are managed by Device",
|
||||
ExpectedReservation: &device.ContainerReservation{
|
||||
Envs: map[string]string{
|
||||
nvidiaVisibleDevices: "UUID1,UUID2,UUID3",
|
||||
},
|
||||
},
|
||||
ExpectedError: nil,
|
||||
RequestedIDs: []string{
|
||||
"UUID1",
|
||||
"UUID2",
|
||||
"UUID3",
|
||||
},
|
||||
Device: &NvidiaDevice{
|
||||
devices: map[string]struct{}{
|
||||
"UUID1": {},
|
||||
"UUID2": {},
|
||||
"UUID3": {},
|
||||
},
|
||||
logger: hclog.NewNullLogger(),
|
||||
},
|
||||
},
|
||||
{
|
||||
Name: "No IDs requested",
|
||||
ExpectedReservation: &device.ContainerReservation{},
|
||||
ExpectedError: nil,
|
||||
RequestedIDs: nil,
|
||||
Device: &NvidiaDevice{
|
||||
devices: map[string]struct{}{
|
||||
"UUID1": {},
|
||||
"UUID2": {},
|
||||
"UUID3": {},
|
||||
},
|
||||
logger: hclog.NewNullLogger(),
|
||||
},
|
||||
},
|
||||
} {
|
||||
actualReservation, actualError := testCase.Device.Reserve(testCase.RequestedIDs)
|
||||
req := require.New(t)
|
||||
req.Equal(testCase.ExpectedReservation, actualReservation)
|
||||
req.Equal(testCase.ExpectedError, actualError)
|
||||
}
|
||||
}
|
||||
235
plugins/device/cmd/nvidia/fingerprint.go
Normal file
235
plugins/device/cmd/nvidia/fingerprint.go
Normal file
@@ -0,0 +1,235 @@
|
||||
package nvidia
|
||||
|
||||
import (
|
||||
"context"
|
||||
"fmt"
|
||||
"time"
|
||||
|
||||
"github.com/hashicorp/nomad/plugins/device"
|
||||
"github.com/hashicorp/nomad/plugins/device/cmd/nvidia/nvml"
|
||||
)
|
||||
|
||||
// Keys of the attribute map reported in Fingerprint output.
const (
	// per-device attributes
	MemoryMiBAttr          = "memory_mib"
	PowerWAttr             = "power_w"
	BAR1MiBAttr            = "bar1_mib"
	CoresClockMHzAttr      = "cores_clock_mhz"
	MemoryClockMHzAttr     = "memory_clock_mhz"
	PCIBandwidthMBPerSAttr = "pci_bandwidth_mb/s"
	DisplayStateAttr       = "display_state"
	PersistenceModeAttr    = "persistence_mode"

	// attribute shared by all devices of the host
	DriverVersionAttr = "driver_version"
)
|
||||
|
||||
// fingerprint is the long running goroutine that detects hardware
|
||||
func (d *NvidiaDevice) fingerprint(ctx context.Context, devices chan<- *device.FingerprintResponse) {
|
||||
defer close(devices)
|
||||
|
||||
if d.nvmlClientInitializationError != nil {
|
||||
d.logger.Error("exiting fingerprinting due to problems with NVML loading", "error", d.nvmlClientInitializationError)
|
||||
// write empty fingerprint response to let server know that there are
|
||||
// no working Nvidia GPU units
|
||||
devices <- device.NewFingerprint()
|
||||
return
|
||||
}
|
||||
|
||||
// Create a timer that will fire immediately for the first detection
|
||||
ticker := time.NewTimer(0)
|
||||
|
||||
for {
|
||||
select {
|
||||
case <-ctx.Done():
|
||||
return
|
||||
case <-ticker.C:
|
||||
ticker.Reset(d.fingerprintPeriod)
|
||||
}
|
||||
d.writeFingerprintToChannel(devices)
|
||||
}
|
||||
}
|
||||
|
||||
// writeFingerprintToChannel makes nvml call and writes response to channel
|
||||
func (d *NvidiaDevice) writeFingerprintToChannel(devices chan<- *device.FingerprintResponse) {
|
||||
fingerprintData, err := d.nvmlClient.GetFingerprintData()
|
||||
|
||||
if err != nil {
|
||||
d.logger.Error("failed to get fingerprint nvidia devices", "error", err)
|
||||
devices <- device.NewFingerprintError(err)
|
||||
return
|
||||
}
|
||||
|
||||
// ignore devices from fingerprint output
|
||||
fingerprintDevices := ignoreFingerprintedDevices(fingerprintData.Devices, d.ignoredGPUIDs)
|
||||
// check if any device health was updated or any device was added to host
|
||||
if !d.fingerprintChanged(fingerprintDevices) {
|
||||
return
|
||||
}
|
||||
|
||||
commonAttributes := map[string]string{
|
||||
DriverVersionAttr: fingerprintData.DriverVersion,
|
||||
}
|
||||
|
||||
// Group all FingerprintDevices by DeviceName attribute
|
||||
deviceListByDeviceName := make(map[string][]*nvml.FingerprintDeviceData)
|
||||
for _, device := range fingerprintDevices {
|
||||
deviceName := device.DeviceName
|
||||
if deviceName == nil {
|
||||
// nvml driver was not able to detect device name. This kind
|
||||
// of devices are placed to single group with 'notAvailable' name
|
||||
notAvailableCopy := notAvailable
|
||||
deviceName = ¬AvailableCopy
|
||||
}
|
||||
|
||||
deviceListByDeviceName[*deviceName] = append(deviceListByDeviceName[*deviceName], device)
|
||||
}
|
||||
|
||||
// Build Fingerprint response with computed groups and send it over the channel
|
||||
deviceGroups := make([]*device.DeviceGroup, 0, len(deviceListByDeviceName))
|
||||
for groupName, devices := range deviceListByDeviceName {
|
||||
deviceGroups = append(deviceGroups, deviceGroupFromFingerprintData(groupName, devices, commonAttributes))
|
||||
}
|
||||
devices <- device.NewFingerprint(deviceGroups...)
|
||||
}
|
||||
|
||||
// ignoreFingerprintedDevices excludes ignored devices from fingerprint output
|
||||
func ignoreFingerprintedDevices(deviceData []*nvml.FingerprintDeviceData, ignoredGPUIDs map[string]struct{}) []*nvml.FingerprintDeviceData {
|
||||
var result []*nvml.FingerprintDeviceData
|
||||
for _, fingerprintDevice := range deviceData {
|
||||
if _, ignored := ignoredGPUIDs[fingerprintDevice.UUID]; !ignored {
|
||||
result = append(result, fingerprintDevice)
|
||||
}
|
||||
}
|
||||
return result
|
||||
}
|
||||
|
||||
// fingerprintChanged checks if there are any previously unseen nvidia devices located
|
||||
// or any of fingerprinted nvidia devices disappeared since the last fingerprint run.
|
||||
// Also, this func updates device map on NvidiaDevice with the latest data
|
||||
func (d *NvidiaDevice) fingerprintChanged(allDevices []*nvml.FingerprintDeviceData) bool {
|
||||
d.deviceLock.Lock()
|
||||
defer d.deviceLock.Unlock()
|
||||
|
||||
changeDetected := false
|
||||
// check if every device in allDevices is in d.devices
|
||||
for _, device := range allDevices {
|
||||
if _, ok := d.devices[device.UUID]; !ok {
|
||||
changeDetected = true
|
||||
}
|
||||
}
|
||||
|
||||
// check if every device in d.devices is in allDevices
|
||||
fingerprintDeviceMap := make(map[string]struct{})
|
||||
for _, device := range allDevices {
|
||||
fingerprintDeviceMap[device.UUID] = struct{}{}
|
||||
}
|
||||
for id := range d.devices {
|
||||
if _, ok := fingerprintDeviceMap[id]; !ok {
|
||||
changeDetected = true
|
||||
}
|
||||
}
|
||||
|
||||
d.devices = fingerprintDeviceMap
|
||||
return changeDetected
|
||||
}
|
||||
|
||||
// deviceGroupFromFingerprintData composes deviceGroup from FingerprintDeviceData slice
|
||||
func deviceGroupFromFingerprintData(groupName string, deviceList []*nvml.FingerprintDeviceData, commonAttributes map[string]string) *device.DeviceGroup {
|
||||
// deviceGroup without devices makes no sense -> return nil when no devices are provided
|
||||
if len(deviceList) == 0 {
|
||||
return nil
|
||||
}
|
||||
|
||||
devices := make([]*device.Device, len(deviceList))
|
||||
for index, dev := range deviceList {
|
||||
devices[index] = &device.Device{
|
||||
ID: dev.UUID,
|
||||
// all fingerprinted devices are "healthy" for now
|
||||
// to get real health data -> dcgm bindings should be used
|
||||
Healthy: true,
|
||||
HwLocality: &device.DeviceLocality{
|
||||
PciBusID: dev.PCIBusID,
|
||||
},
|
||||
}
|
||||
}
|
||||
|
||||
deviceGroup := &device.DeviceGroup{
|
||||
Vendor: vendor,
|
||||
Type: deviceType,
|
||||
Name: groupName,
|
||||
Devices: devices,
|
||||
// Assumption made that devices with the same DeviceName have the same
|
||||
// attributes like amount of memory, power, bar1memory etc
|
||||
Attributes: attributesFromFingerprintDeviceData(deviceList[0]),
|
||||
}
|
||||
|
||||
// Extend attribute map with common attributes
|
||||
for attributeKey, attributeValue := range commonAttributes {
|
||||
deviceGroup.Attributes[attributeKey] = attributeValue
|
||||
}
|
||||
|
||||
return deviceGroup
|
||||
}
|
||||
|
||||
// attributesFromFingerprintDeviceData converts nvml.FingerprintDeviceData
|
||||
// struct to device.DeviceGroup.Attributes format (map[string]string)
|
||||
// this function performs all nil checks for FingerprintDeviceData pointers
|
||||
func attributesFromFingerprintDeviceData(fingerprintDeviceData *nvml.FingerprintDeviceData) map[string]string {
|
||||
// The following fields in FingerprintDeviceData are pointers, so they can be nil
|
||||
// In case they are nil -> return 'notAvailable' constant instead
|
||||
var (
|
||||
MemoryMiB string
|
||||
PowerW string
|
||||
BAR1MiB string
|
||||
CoresClockMHz string
|
||||
MemoryClockMHz string
|
||||
PCIBandwidthMBPerS string
|
||||
)
|
||||
|
||||
if fingerprintDeviceData.MemoryMiB == nil {
|
||||
MemoryMiB = notAvailable
|
||||
} else {
|
||||
MemoryMiB = fmt.Sprint(*fingerprintDeviceData.MemoryMiB)
|
||||
}
|
||||
|
||||
if fingerprintDeviceData.PowerW == nil {
|
||||
PowerW = notAvailable
|
||||
} else {
|
||||
PowerW = fmt.Sprint(*fingerprintDeviceData.PowerW)
|
||||
}
|
||||
|
||||
if fingerprintDeviceData.BAR1MiB == nil {
|
||||
BAR1MiB = notAvailable
|
||||
} else {
|
||||
BAR1MiB = fmt.Sprint(*fingerprintDeviceData.BAR1MiB)
|
||||
}
|
||||
|
||||
if fingerprintDeviceData.CoresClockMHz == nil {
|
||||
CoresClockMHz = notAvailable
|
||||
} else {
|
||||
CoresClockMHz = fmt.Sprint(*fingerprintDeviceData.CoresClockMHz)
|
||||
}
|
||||
|
||||
if fingerprintDeviceData.MemoryClockMHz == nil {
|
||||
MemoryClockMHz = notAvailable
|
||||
} else {
|
||||
MemoryClockMHz = fmt.Sprint(*fingerprintDeviceData.MemoryClockMHz)
|
||||
}
|
||||
|
||||
if fingerprintDeviceData.PCIBandwidthMBPerS == nil {
|
||||
PCIBandwidthMBPerS = notAvailable
|
||||
} else {
|
||||
PCIBandwidthMBPerS = fmt.Sprint(*fingerprintDeviceData.PCIBandwidthMBPerS)
|
||||
}
|
||||
|
||||
return map[string]string{
|
||||
DisplayStateAttr: fingerprintDeviceData.DisplayState,
|
||||
PersistenceModeAttr: fingerprintDeviceData.PersistenceMode,
|
||||
MemoryMiBAttr: MemoryMiB,
|
||||
PowerWAttr: PowerW,
|
||||
BAR1MiBAttr: BAR1MiB,
|
||||
CoresClockMHzAttr: CoresClockMHz,
|
||||
MemoryClockMHzAttr: MemoryClockMHz,
|
||||
PCIBandwidthMBPerSAttr: PCIBandwidthMBPerS,
|
||||
}
|
||||
|
||||
}
|
||||
1243
plugins/device/cmd/nvidia/fingerprint_test.go
Normal file
1243
plugins/device/cmd/nvidia/fingerprint_test.go
Normal file
File diff suppressed because it is too large
Load Diff
194
plugins/device/cmd/nvidia/nvml/client.go
Normal file
194
plugins/device/cmd/nvidia/nvml/client.go
Normal file
@@ -0,0 +1,194 @@
|
||||
package nvml
|
||||
|
||||
import (
|
||||
"fmt"
|
||||
)
|
||||
|
||||
// DeviceData holds the fields shared by the fingerprint and the statistics
// view of an Nvidia device. Pointer fields may be nil.
type DeviceData struct {
	UUID       string
	DeviceName *string
	MemoryMiB  *uint64
	PowerW     *uint
	BAR1MiB    *uint64
}
|
||||
|
||||
// FingerprintDeviceData is a superset of DeviceData
|
||||
// it describes device specific fields returned from
|
||||
// nvml queries during fingerprinting call
|
||||
type FingerprintDeviceData struct {
|
||||
*DeviceData
|
||||
PCIBandwidthMBPerS *uint
|
||||
CoresClockMHz *uint
|
||||
MemoryClockMHz *uint
|
||||
DisplayState string
|
||||
PersistenceMode string
|
||||
PCIBusID string
|
||||
}
|
||||
|
||||
// FingerprintData represets attributes of driver/devices
|
||||
type FingerprintData struct {
|
||||
Devices []*FingerprintDeviceData
|
||||
DriverVersion string
|
||||
}
|
||||
|
||||
// StatsData is a superset of DeviceData
|
||||
// it represents statistics data returned for every Nvidia device
|
||||
type StatsData struct {
|
||||
*DeviceData
|
||||
PowerUsageW *uint
|
||||
GPUUtilization *uint
|
||||
MemoryUtilization *uint
|
||||
EncoderUtilization *uint
|
||||
DecoderUtilization *uint
|
||||
TemperatureC *uint
|
||||
UsedMemoryMiB *uint64
|
||||
BAR1UsedMiB *uint64
|
||||
ECCErrorsL1Cache *uint64
|
||||
ECCErrorsL2Cache *uint64
|
||||
ECCErrorsDevice *uint64
|
||||
}
|
||||
|
||||
// NvmlClient describes how users would use nvml library
|
||||
type NvmlClient interface {
|
||||
GetFingerprintData() (*FingerprintData, error)
|
||||
GetStatsData() ([]*StatsData, error)
|
||||
}
|
||||
|
||||
// nvmlClient implements NvmlClient
|
||||
// Users of this lib are expected to use this struct via NewNvmlClient func
|
||||
type nvmlClient struct {
|
||||
driver NvmlDriver
|
||||
}
|
||||
|
||||
// NewNvmlClient function creates new nvmlClient with real
|
||||
// NvmlDriver implementation. Also, this func initializes NvmlDriver
|
||||
func NewNvmlClient() (*nvmlClient, error) {
|
||||
driver := &nvmlDriver{}
|
||||
err := driver.Initialize()
|
||||
if err != nil {
|
||||
return nil, err
|
||||
}
|
||||
return &nvmlClient{
|
||||
driver: driver,
|
||||
}, nil
|
||||
}
|
||||
|
||||
// GetFingerprintData returns FingerprintData for available Nvidia devices
|
||||
func (c *nvmlClient) GetFingerprintData() (*FingerprintData, error) {
|
||||
/*
|
||||
nvml fields to be fingerprinted # nvml_library_call
|
||||
1 - Driver Version # nvmlSystemGetDriverVersion
|
||||
2 - Product Name # nvmlDeviceGetName
|
||||
3 - GPU UUID # nvmlDeviceGetUUID
|
||||
4 - Total Memory # nvmlDeviceGetMemoryInfo
|
||||
5 - Power # nvmlDeviceGetPowerManagementLimit
|
||||
6 - PCIBusID # nvmlDeviceGetPciInfo
|
||||
7 - BAR1 Memory # nvmlDeviceGetBAR1MemoryInfo(
|
||||
8 - PCI Bandwidth
|
||||
9 - Memory, Cores Clock # nvmlDeviceGetMaxClockInfo
|
||||
10 - Display Mode # nvmlDeviceGetDisplayMode
|
||||
11 - Persistence Mode # nvmlDeviceGetPersistenceMode
|
||||
*/
|
||||
|
||||
// Assumed that this method is called with receiver retrieved from
|
||||
// NewNvmlClient
|
||||
// because this method handles initialization of NVML library
|
||||
|
||||
driverVersion, err := c.driver.SystemDriverVersion()
|
||||
if err != nil {
|
||||
return nil, fmt.Errorf("nvidia nvml SystemDriverVersion() error: %v\n", err)
|
||||
}
|
||||
|
||||
numDevices, err := c.driver.DeviceCount()
|
||||
if err != nil {
|
||||
return nil, fmt.Errorf("nvidia nvml DeviceCount() error: %v\n", err)
|
||||
}
|
||||
|
||||
allNvidiaGPUResources := make([]*FingerprintDeviceData, numDevices)
|
||||
|
||||
for i := 0; i < int(numDevices); i++ {
|
||||
deviceInfo, err := c.driver.DeviceInfoByIndex(uint(i))
|
||||
if err != nil {
|
||||
return nil, fmt.Errorf("nvidia nvml DeviceInfoByIndex() error: %v\n", err)
|
||||
}
|
||||
|
||||
allNvidiaGPUResources[i] = &FingerprintDeviceData{
|
||||
DeviceData: &DeviceData{
|
||||
DeviceName: deviceInfo.Name,
|
||||
UUID: deviceInfo.UUID,
|
||||
MemoryMiB: deviceInfo.MemoryMiB,
|
||||
PowerW: deviceInfo.PowerW,
|
||||
BAR1MiB: deviceInfo.BAR1MiB,
|
||||
},
|
||||
PCIBandwidthMBPerS: deviceInfo.PCIBandwidthMBPerS,
|
||||
CoresClockMHz: deviceInfo.CoresClockMHz,
|
||||
MemoryClockMHz: deviceInfo.MemoryClockMHz,
|
||||
DisplayState: deviceInfo.DisplayState,
|
||||
PersistenceMode: deviceInfo.PersistenceMode,
|
||||
PCIBusID: deviceInfo.PCIBusID,
|
||||
}
|
||||
}
|
||||
return &FingerprintData{
|
||||
Devices: allNvidiaGPUResources,
|
||||
DriverVersion: driverVersion,
|
||||
}, nil
|
||||
}
|
||||
|
||||
// GetStatsData returns statistics data for all devices on this machine
|
||||
func (c *nvmlClient) GetStatsData() ([]*StatsData, error) {
|
||||
/*
|
||||
nvml fields to be reported to stats api # nvml_library_call
|
||||
1 - Used Memory # nvmlDeviceGetMemoryInfo
|
||||
2 - Utilization of GPU # nvmlDeviceGetUtilizationRates
|
||||
3 - Utilization of Memory # nvmlDeviceGetUtilizationRates
|
||||
4 - Utilization of Decoder # nvmlDeviceGetDecoderUtilization
|
||||
5 - Utilization of Encoder # nvmlDeviceGetEncoderUtilization
|
||||
6 - Current GPU Temperature # nvmlDeviceGetTemperature
|
||||
7 - Power Draw # nvmlDeviceGetPowerUsage
|
||||
8 - BAR1 Used memory # nvmlDeviceGetBAR1MemoryInfo
|
||||
9 - ECC Errors on requesting L1Cache # nvmlDeviceGetMemoryErrorCounter
|
||||
10 - ECC Errors on requesting L2Cache # nvmlDeviceGetMemoryErrorCounter
|
||||
11 - ECC Errors on requesting Device memory # nvmlDeviceGetMemoryErrorCounter
|
||||
*/
|
||||
|
||||
// Assumed that this method is called with receiver retrieved from
|
||||
// NewNvmlClient
|
||||
// because this method handles initialization of NVML library
|
||||
|
||||
numDevices, err := c.driver.DeviceCount()
|
||||
if err != nil {
|
||||
return nil, fmt.Errorf("nvidia nvml DeviceCount() error: %v\n", err)
|
||||
}
|
||||
|
||||
allNvidiaGPUStats := make([]*StatsData, numDevices)
|
||||
|
||||
for i := 0; i < int(numDevices); i++ {
|
||||
deviceInfo, deviceStatus, err := c.driver.DeviceInfoAndStatusByIndex(uint(i))
|
||||
if err != nil {
|
||||
return nil, fmt.Errorf("nvidia nvml DeviceInfoAndStatusByIndex() error: %v\n", err)
|
||||
}
|
||||
|
||||
allNvidiaGPUStats[i] = &StatsData{
|
||||
DeviceData: &DeviceData{
|
||||
DeviceName: deviceInfo.Name,
|
||||
UUID: deviceInfo.UUID,
|
||||
MemoryMiB: deviceInfo.MemoryMiB,
|
||||
PowerW: deviceInfo.PowerW,
|
||||
BAR1MiB: deviceInfo.BAR1MiB,
|
||||
},
|
||||
PowerUsageW: deviceStatus.PowerUsageW,
|
||||
GPUUtilization: deviceStatus.GPUUtilization,
|
||||
MemoryUtilization: deviceStatus.MemoryUtilization,
|
||||
EncoderUtilization: deviceStatus.EncoderUtilization,
|
||||
DecoderUtilization: deviceStatus.DecoderUtilization,
|
||||
TemperatureC: deviceStatus.TemperatureC,
|
||||
UsedMemoryMiB: deviceStatus.UsedMemoryMiB,
|
||||
BAR1UsedMiB: deviceStatus.BAR1UsedMiB,
|
||||
ECCErrorsL1Cache: deviceStatus.ECCErrorsL1Cache,
|
||||
ECCErrorsL2Cache: deviceStatus.ECCErrorsL2Cache,
|
||||
ECCErrorsDevice: deviceStatus.ECCErrorsDevice,
|
||||
}
|
||||
}
|
||||
return allNvidiaGPUStats, nil
|
||||
}
|
||||
399
plugins/device/cmd/nvidia/nvml/client_test.go
Normal file
399
plugins/device/cmd/nvidia/nvml/client_test.go
Normal file
@@ -0,0 +1,399 @@
|
||||
package nvml
|
||||
|
||||
import (
|
||||
"errors"
|
||||
"testing"
|
||||
|
||||
"github.com/hashicorp/nomad/helper"
|
||||
"github.com/stretchr/testify/require"
|
||||
)
|
||||
|
||||
type MockNVMLDriver struct {
|
||||
systemDriverCallSuccessful bool
|
||||
deviceCountCallSuccessful bool
|
||||
deviceInfoByIndexCallSuccessful bool
|
||||
deviceInfoAndStatusByIndexCallSuccessful bool
|
||||
driverVersion string
|
||||
devices []*DeviceInfo
|
||||
deviceStatus []*DeviceStatus
|
||||
}
|
||||
|
||||
func (m *MockNVMLDriver) Initialize() error {
|
||||
return nil
|
||||
}
|
||||
|
||||
func (m *MockNVMLDriver) Shutdown() error {
|
||||
return nil
|
||||
}
|
||||
|
||||
func (m *MockNVMLDriver) SystemDriverVersion() (string, error) {
|
||||
if !m.systemDriverCallSuccessful {
|
||||
return "", errors.New("failed to get system driver")
|
||||
}
|
||||
return m.driverVersion, nil
|
||||
}
|
||||
|
||||
func (m *MockNVMLDriver) DeviceCount() (uint, error) {
|
||||
if !m.deviceCountCallSuccessful {
|
||||
return 0, errors.New("failed to get device length")
|
||||
}
|
||||
return uint(len(m.devices)), nil
|
||||
}
|
||||
|
||||
func (m *MockNVMLDriver) DeviceInfoByIndex(index uint) (*DeviceInfo, error) {
|
||||
if index >= uint(len(m.devices)) {
|
||||
return nil, errors.New("index is out of range")
|
||||
}
|
||||
if !m.deviceInfoByIndexCallSuccessful {
|
||||
return nil, errors.New("failed to get device info by index")
|
||||
}
|
||||
return m.devices[index], nil
|
||||
}
|
||||
|
||||
func (m *MockNVMLDriver) DeviceInfoAndStatusByIndex(index uint) (*DeviceInfo, *DeviceStatus, error) {
|
||||
if index >= uint(len(m.devices)) || index >= uint(len(m.deviceStatus)) {
|
||||
return nil, nil, errors.New("index is out of range")
|
||||
}
|
||||
if !m.deviceInfoAndStatusByIndexCallSuccessful {
|
||||
return nil, nil, errors.New("failed to get device info and status by index")
|
||||
}
|
||||
return m.devices[index], m.deviceStatus[index], nil
|
||||
}
|
||||
|
||||
func TestGetFingerprintDataFromNVML(t *testing.T) {
|
||||
for _, testCase := range []struct {
|
||||
Name string
|
||||
DriverConfiguration *MockNVMLDriver
|
||||
ExpectedError bool
|
||||
ExpectedResult *FingerprintData
|
||||
}{
|
||||
{
|
||||
Name: "fail on systemDriverCallSuccessful",
|
||||
ExpectedError: true,
|
||||
ExpectedResult: nil,
|
||||
DriverConfiguration: &MockNVMLDriver{
|
||||
systemDriverCallSuccessful: false,
|
||||
deviceCountCallSuccessful: true,
|
||||
deviceInfoByIndexCallSuccessful: true,
|
||||
},
|
||||
},
|
||||
{
|
||||
Name: "fail on deviceCountCallSuccessful",
|
||||
ExpectedError: true,
|
||||
ExpectedResult: nil,
|
||||
DriverConfiguration: &MockNVMLDriver{
|
||||
systemDriverCallSuccessful: true,
|
||||
deviceCountCallSuccessful: false,
|
||||
deviceInfoByIndexCallSuccessful: true,
|
||||
},
|
||||
},
|
||||
{
|
||||
Name: "fail on deviceInfoByIndexCall",
|
||||
ExpectedError: true,
|
||||
ExpectedResult: nil,
|
||||
DriverConfiguration: &MockNVMLDriver{
|
||||
systemDriverCallSuccessful: true,
|
||||
deviceCountCallSuccessful: true,
|
||||
deviceInfoByIndexCallSuccessful: false,
|
||||
devices: []*DeviceInfo{
|
||||
{
|
||||
UUID: "UUID1",
|
||||
Name: helper.StringToPtr("ModelName1"),
|
||||
MemoryMiB: helper.Uint64ToPtr(16),
|
||||
PCIBusID: "busId",
|
||||
PowerW: helper.UintToPtr(100),
|
||||
BAR1MiB: helper.Uint64ToPtr(100),
|
||||
PCIBandwidthMBPerS: helper.UintToPtr(100),
|
||||
CoresClockMHz: helper.UintToPtr(100),
|
||||
MemoryClockMHz: helper.UintToPtr(100),
|
||||
}, {
|
||||
UUID: "UUID2",
|
||||
Name: helper.StringToPtr("ModelName2"),
|
||||
MemoryMiB: helper.Uint64ToPtr(8),
|
||||
PCIBusID: "busId",
|
||||
PowerW: helper.UintToPtr(100),
|
||||
BAR1MiB: helper.Uint64ToPtr(100),
|
||||
PCIBandwidthMBPerS: helper.UintToPtr(100),
|
||||
CoresClockMHz: helper.UintToPtr(100),
|
||||
MemoryClockMHz: helper.UintToPtr(100),
|
||||
},
|
||||
},
|
||||
},
|
||||
},
|
||||
{
|
||||
Name: "successful outcome",
|
||||
ExpectedError: false,
|
||||
ExpectedResult: &FingerprintData{
|
||||
DriverVersion: "driverVersion",
|
||||
Devices: []*FingerprintDeviceData{
|
||||
{
|
||||
DeviceData: &DeviceData{
|
||||
DeviceName: helper.StringToPtr("ModelName1"),
|
||||
UUID: "UUID1",
|
||||
MemoryMiB: helper.Uint64ToPtr(16),
|
||||
PowerW: helper.UintToPtr(100),
|
||||
BAR1MiB: helper.Uint64ToPtr(100),
|
||||
},
|
||||
PCIBusID: "busId1",
|
||||
PCIBandwidthMBPerS: helper.UintToPtr(100),
|
||||
CoresClockMHz: helper.UintToPtr(100),
|
||||
MemoryClockMHz: helper.UintToPtr(100),
|
||||
DisplayState: "Enabled",
|
||||
PersistenceMode: "Enabled",
|
||||
}, {
|
||||
DeviceData: &DeviceData{
|
||||
DeviceName: helper.StringToPtr("ModelName2"),
|
||||
UUID: "UUID2",
|
||||
MemoryMiB: helper.Uint64ToPtr(8),
|
||||
PowerW: helper.UintToPtr(200),
|
||||
BAR1MiB: helper.Uint64ToPtr(200),
|
||||
},
|
||||
PCIBusID: "busId2",
|
||||
PCIBandwidthMBPerS: helper.UintToPtr(200),
|
||||
CoresClockMHz: helper.UintToPtr(200),
|
||||
MemoryClockMHz: helper.UintToPtr(200),
|
||||
DisplayState: "Enabled",
|
||||
PersistenceMode: "Enabled",
|
||||
},
|
||||
},
|
||||
},
|
||||
DriverConfiguration: &MockNVMLDriver{
|
||||
systemDriverCallSuccessful: true,
|
||||
deviceCountCallSuccessful: true,
|
||||
deviceInfoByIndexCallSuccessful: true,
|
||||
driverVersion: "driverVersion",
|
||||
devices: []*DeviceInfo{
|
||||
{
|
||||
UUID: "UUID1",
|
||||
Name: helper.StringToPtr("ModelName1"),
|
||||
MemoryMiB: helper.Uint64ToPtr(16),
|
||||
PCIBusID: "busId1",
|
||||
PowerW: helper.UintToPtr(100),
|
||||
BAR1MiB: helper.Uint64ToPtr(100),
|
||||
PCIBandwidthMBPerS: helper.UintToPtr(100),
|
||||
CoresClockMHz: helper.UintToPtr(100),
|
||||
MemoryClockMHz: helper.UintToPtr(100),
|
||||
DisplayState: "Enabled",
|
||||
PersistenceMode: "Enabled",
|
||||
}, {
|
||||
UUID: "UUID2",
|
||||
Name: helper.StringToPtr("ModelName2"),
|
||||
MemoryMiB: helper.Uint64ToPtr(8),
|
||||
PCIBusID: "busId2",
|
||||
PowerW: helper.UintToPtr(200),
|
||||
BAR1MiB: helper.Uint64ToPtr(200),
|
||||
PCIBandwidthMBPerS: helper.UintToPtr(200),
|
||||
CoresClockMHz: helper.UintToPtr(200),
|
||||
MemoryClockMHz: helper.UintToPtr(200),
|
||||
DisplayState: "Enabled",
|
||||
PersistenceMode: "Enabled",
|
||||
},
|
||||
},
|
||||
},
|
||||
},
|
||||
} {
|
||||
cli := nvmlClient{driver: testCase.DriverConfiguration}
|
||||
fingerprintData, err := cli.GetFingerprintData()
|
||||
if testCase.ExpectedError && err == nil {
|
||||
t.Errorf("case '%s' : expected Error, but didn't get one", testCase.Name)
|
||||
}
|
||||
if !testCase.ExpectedError && err != nil {
|
||||
t.Errorf("case '%s' : unexpected Error '%v'", testCase.Name, err)
|
||||
}
|
||||
require.New(t).Equal(testCase.ExpectedResult, fingerprintData)
|
||||
}
|
||||
}
|
||||
|
||||
func TestGetStatsDataFromNVML(t *testing.T) {
|
||||
for _, testCase := range []struct {
|
||||
Name string
|
||||
DriverConfiguration *MockNVMLDriver
|
||||
ExpectedError bool
|
||||
ExpectedResult []*StatsData
|
||||
}{
|
||||
{
|
||||
Name: "fail on deviceCountCallSuccessful",
|
||||
ExpectedError: true,
|
||||
ExpectedResult: nil,
|
||||
DriverConfiguration: &MockNVMLDriver{
|
||||
systemDriverCallSuccessful: true,
|
||||
deviceCountCallSuccessful: false,
|
||||
deviceInfoByIndexCallSuccessful: true,
|
||||
deviceInfoAndStatusByIndexCallSuccessful: true,
|
||||
},
|
||||
},
|
||||
{
|
||||
Name: "fail on DeviceInfoAndStatusByIndex call",
|
||||
ExpectedError: true,
|
||||
ExpectedResult: nil,
|
||||
DriverConfiguration: &MockNVMLDriver{
|
||||
systemDriverCallSuccessful: true,
|
||||
deviceCountCallSuccessful: true,
|
||||
deviceInfoAndStatusByIndexCallSuccessful: false,
|
||||
devices: []*DeviceInfo{
|
||||
{
|
||||
UUID: "UUID1",
|
||||
Name: helper.StringToPtr("ModelName1"),
|
||||
MemoryMiB: helper.Uint64ToPtr(16),
|
||||
PCIBusID: "busId1",
|
||||
PowerW: helper.UintToPtr(100),
|
||||
BAR1MiB: helper.Uint64ToPtr(100),
|
||||
PCIBandwidthMBPerS: helper.UintToPtr(100),
|
||||
CoresClockMHz: helper.UintToPtr(100),
|
||||
MemoryClockMHz: helper.UintToPtr(100),
|
||||
}, {
|
||||
UUID: "UUID2",
|
||||
Name: helper.StringToPtr("ModelName2"),
|
||||
MemoryMiB: helper.Uint64ToPtr(8),
|
||||
PCIBusID: "busId2",
|
||||
PowerW: helper.UintToPtr(200),
|
||||
BAR1MiB: helper.Uint64ToPtr(200),
|
||||
PCIBandwidthMBPerS: helper.UintToPtr(200),
|
||||
CoresClockMHz: helper.UintToPtr(200),
|
||||
MemoryClockMHz: helper.UintToPtr(200),
|
||||
},
|
||||
},
|
||||
deviceStatus: []*DeviceStatus{
|
||||
{
|
||||
TemperatureC: helper.UintToPtr(1),
|
||||
GPUUtilization: helper.UintToPtr(1),
|
||||
MemoryUtilization: helper.UintToPtr(1),
|
||||
EncoderUtilization: helper.UintToPtr(1),
|
||||
DecoderUtilization: helper.UintToPtr(1),
|
||||
UsedMemoryMiB: helper.Uint64ToPtr(1),
|
||||
ECCErrorsL1Cache: helper.Uint64ToPtr(1),
|
||||
ECCErrorsL2Cache: helper.Uint64ToPtr(1),
|
||||
ECCErrorsDevice: helper.Uint64ToPtr(1),
|
||||
PowerUsageW: helper.UintToPtr(1),
|
||||
BAR1UsedMiB: helper.Uint64ToPtr(1),
|
||||
},
|
||||
{
|
||||
TemperatureC: helper.UintToPtr(2),
|
||||
GPUUtilization: helper.UintToPtr(2),
|
||||
MemoryUtilization: helper.UintToPtr(2),
|
||||
EncoderUtilization: helper.UintToPtr(2),
|
||||
DecoderUtilization: helper.UintToPtr(2),
|
||||
UsedMemoryMiB: helper.Uint64ToPtr(2),
|
||||
ECCErrorsL1Cache: helper.Uint64ToPtr(2),
|
||||
ECCErrorsL2Cache: helper.Uint64ToPtr(2),
|
||||
ECCErrorsDevice: helper.Uint64ToPtr(2),
|
||||
PowerUsageW: helper.UintToPtr(2),
|
||||
BAR1UsedMiB: helper.Uint64ToPtr(2),
|
||||
},
|
||||
},
|
||||
},
|
||||
},
|
||||
{
|
||||
Name: "successful outcome",
|
||||
ExpectedError: false,
|
||||
ExpectedResult: []*StatsData{
|
||||
{
|
||||
DeviceData: &DeviceData{
|
||||
DeviceName: helper.StringToPtr("ModelName1"),
|
||||
UUID: "UUID1",
|
||||
MemoryMiB: helper.Uint64ToPtr(16),
|
||||
PowerW: helper.UintToPtr(100),
|
||||
BAR1MiB: helper.Uint64ToPtr(100),
|
||||
},
|
||||
TemperatureC: helper.UintToPtr(1),
|
||||
GPUUtilization: helper.UintToPtr(1),
|
||||
MemoryUtilization: helper.UintToPtr(1),
|
||||
EncoderUtilization: helper.UintToPtr(1),
|
||||
DecoderUtilization: helper.UintToPtr(1),
|
||||
UsedMemoryMiB: helper.Uint64ToPtr(1),
|
||||
ECCErrorsL1Cache: helper.Uint64ToPtr(1),
|
||||
ECCErrorsL2Cache: helper.Uint64ToPtr(1),
|
||||
ECCErrorsDevice: helper.Uint64ToPtr(1),
|
||||
PowerUsageW: helper.UintToPtr(1),
|
||||
BAR1UsedMiB: helper.Uint64ToPtr(1),
|
||||
},
|
||||
{
|
||||
DeviceData: &DeviceData{
|
||||
DeviceName: helper.StringToPtr("ModelName2"),
|
||||
UUID: "UUID2",
|
||||
MemoryMiB: helper.Uint64ToPtr(8),
|
||||
PowerW: helper.UintToPtr(200),
|
||||
BAR1MiB: helper.Uint64ToPtr(200),
|
||||
},
|
||||
TemperatureC: helper.UintToPtr(2),
|
||||
GPUUtilization: helper.UintToPtr(2),
|
||||
MemoryUtilization: helper.UintToPtr(2),
|
||||
EncoderUtilization: helper.UintToPtr(2),
|
||||
DecoderUtilization: helper.UintToPtr(2),
|
||||
UsedMemoryMiB: helper.Uint64ToPtr(2),
|
||||
ECCErrorsL1Cache: helper.Uint64ToPtr(2),
|
||||
ECCErrorsL2Cache: helper.Uint64ToPtr(2),
|
||||
ECCErrorsDevice: helper.Uint64ToPtr(2),
|
||||
PowerUsageW: helper.UintToPtr(2),
|
||||
BAR1UsedMiB: helper.Uint64ToPtr(2),
|
||||
},
|
||||
},
|
||||
DriverConfiguration: &MockNVMLDriver{
|
||||
deviceCountCallSuccessful: true,
|
||||
deviceInfoByIndexCallSuccessful: true,
|
||||
deviceInfoAndStatusByIndexCallSuccessful: true,
|
||||
devices: []*DeviceInfo{
|
||||
{
|
||||
UUID: "UUID1",
|
||||
Name: helper.StringToPtr("ModelName1"),
|
||||
MemoryMiB: helper.Uint64ToPtr(16),
|
||||
PCIBusID: "busId1",
|
||||
PowerW: helper.UintToPtr(100),
|
||||
BAR1MiB: helper.Uint64ToPtr(100),
|
||||
PCIBandwidthMBPerS: helper.UintToPtr(100),
|
||||
CoresClockMHz: helper.UintToPtr(100),
|
||||
MemoryClockMHz: helper.UintToPtr(100),
|
||||
}, {
|
||||
UUID: "UUID2",
|
||||
Name: helper.StringToPtr("ModelName2"),
|
||||
MemoryMiB: helper.Uint64ToPtr(8),
|
||||
PCIBusID: "busId2",
|
||||
PowerW: helper.UintToPtr(200),
|
||||
BAR1MiB: helper.Uint64ToPtr(200),
|
||||
PCIBandwidthMBPerS: helper.UintToPtr(200),
|
||||
CoresClockMHz: helper.UintToPtr(200),
|
||||
MemoryClockMHz: helper.UintToPtr(200),
|
||||
},
|
||||
},
|
||||
deviceStatus: []*DeviceStatus{
|
||||
{
|
||||
TemperatureC: helper.UintToPtr(1),
|
||||
GPUUtilization: helper.UintToPtr(1),
|
||||
MemoryUtilization: helper.UintToPtr(1),
|
||||
EncoderUtilization: helper.UintToPtr(1),
|
||||
DecoderUtilization: helper.UintToPtr(1),
|
||||
UsedMemoryMiB: helper.Uint64ToPtr(1),
|
||||
ECCErrorsL1Cache: helper.Uint64ToPtr(1),
|
||||
ECCErrorsL2Cache: helper.Uint64ToPtr(1),
|
||||
ECCErrorsDevice: helper.Uint64ToPtr(1),
|
||||
PowerUsageW: helper.UintToPtr(1),
|
||||
BAR1UsedMiB: helper.Uint64ToPtr(1),
|
||||
},
|
||||
{
|
||||
TemperatureC: helper.UintToPtr(2),
|
||||
GPUUtilization: helper.UintToPtr(2),
|
||||
MemoryUtilization: helper.UintToPtr(2),
|
||||
EncoderUtilization: helper.UintToPtr(2),
|
||||
DecoderUtilization: helper.UintToPtr(2),
|
||||
UsedMemoryMiB: helper.Uint64ToPtr(2),
|
||||
ECCErrorsL1Cache: helper.Uint64ToPtr(2),
|
||||
ECCErrorsL2Cache: helper.Uint64ToPtr(2),
|
||||
ECCErrorsDevice: helper.Uint64ToPtr(2),
|
||||
PowerUsageW: helper.UintToPtr(2),
|
||||
BAR1UsedMiB: helper.Uint64ToPtr(2),
|
||||
},
|
||||
},
|
||||
},
|
||||
},
|
||||
} {
|
||||
cli := nvmlClient{driver: testCase.DriverConfiguration}
|
||||
statsData, err := cli.GetStatsData()
|
||||
if testCase.ExpectedError && err == nil {
|
||||
t.Errorf("case '%s' : expected Error, but didn't get one", testCase.Name)
|
||||
}
|
||||
if !testCase.ExpectedError && err != nil {
|
||||
t.Errorf("case '%s' : unexpected Error '%v'", testCase.Name, err)
|
||||
}
|
||||
require.New(t).Equal(testCase.ExpectedResult, statsData)
|
||||
}
|
||||
}
|
||||
138
plugins/device/cmd/nvidia/nvml/driver.go
Normal file
138
plugins/device/cmd/nvidia/nvml/driver.go
Normal file
@@ -0,0 +1,138 @@
|
||||
package nvml
|
||||
|
||||
import (
|
||||
"github.com/NVIDIA/gpu-monitoring-tools/bindings/go/nvml"
|
||||
)
|
||||
|
||||
// DeviceInfo holds the descriptive data of one GPU as read from NVML. It is
// the value returned by the NvmlDriver methods DeviceInfoByIndex and
// DeviceInfoAndStatusByIndex.
type DeviceInfo struct {
	// Plain string fields are always available from NVML for a device.
	// Note that nvmlDriver fills DisplayState and PersistenceMode only in
	// DeviceInfoByIndex; DeviceInfoAndStatusByIndex leaves them empty.
	UUID            string
	PCIBusID        string
	DisplayState    string
	PersistenceMode string

	// Pointer fields are optional: each one is nil when NVML could not
	// retrieve that value for the specific card.
	Name               *string
	MemoryMiB          *uint64
	PowerW             *uint
	BAR1MiB            *uint64
	PCIBandwidthMBPerS *uint
	CoresClockMHz      *uint
	MemoryClockMHz     *uint
}
|
||||
|
||||
// DeviceStatus holds the point-in-time readings of one GPU as read from NVML.
// It is the second value returned by NvmlDriver.DeviceInfoAndStatusByIndex.
//
// Every field is optional: a nil pointer means NVML could not retrieve that
// reading for the specific card.
type DeviceStatus struct {
	PowerUsageW        *uint
	TemperatureC       *uint
	GPUUtilization     *uint // percent
	MemoryUtilization  *uint // percent
	EncoderUtilization *uint // percent
	DecoderUtilization *uint // percent
	BAR1UsedMiB        *uint64
	UsedMemoryMiB      *uint64
	ECCErrorsL1Cache   *uint64
	ECCErrorsL2Cache   *uint64
	ECCErrorsDevice    *uint64
}
|
||||
|
||||
// NvmlDriver represents set of methods to query nvml library
|
||||
type NvmlDriver interface {
|
||||
Initialize() error
|
||||
Shutdown() error
|
||||
SystemDriverVersion() (string, error)
|
||||
DeviceCount() (uint, error)
|
||||
DeviceInfoByIndex(uint) (*DeviceInfo, error)
|
||||
DeviceInfoAndStatusByIndex(uint) (*DeviceInfo, *DeviceStatus, error)
|
||||
}
|
||||
|
||||
// nvmlDriver implements NvmlDriver
|
||||
// Users are required to call Initialize method before using any other methods
|
||||
type nvmlDriver struct{}
|
||||
|
||||
// Initialize nvml library by locating nvml shared object file and calling ldopen
|
||||
func (n *nvmlDriver) Initialize() error {
|
||||
return nvml.Init()
|
||||
}
|
||||
|
||||
// Shutdown stops any further interaction with nvml
|
||||
func (n *nvmlDriver) Shutdown() error {
|
||||
return nvml.Shutdown()
|
||||
}
|
||||
|
||||
// SystemDriverVersion returns installed driver version
|
||||
func (n *nvmlDriver) SystemDriverVersion() (string, error) {
|
||||
return nvml.GetDriverVersion()
|
||||
}
|
||||
|
||||
// DeviceCount reports number of available GPU devices
|
||||
func (n *nvmlDriver) DeviceCount() (uint, error) {
|
||||
return nvml.GetDeviceCount()
|
||||
}
|
||||
|
||||
// DeviceInfoByIndex returns DeviceInfo for index GPU in system device list
|
||||
func (n *nvmlDriver) DeviceInfoByIndex(index uint) (*DeviceInfo, error) {
|
||||
device, err := nvml.NewDevice(index)
|
||||
if err != nil {
|
||||
return nil, err
|
||||
}
|
||||
deviceMode, err := device.GetDeviceMode()
|
||||
if err != nil {
|
||||
return nil, err
|
||||
}
|
||||
return &DeviceInfo{
|
||||
UUID: device.UUID,
|
||||
Name: device.Model,
|
||||
MemoryMiB: device.Memory,
|
||||
PowerW: device.Power,
|
||||
BAR1MiB: device.PCI.BAR1,
|
||||
PCIBandwidthMBPerS: device.PCI.Bandwidth,
|
||||
PCIBusID: device.PCI.BusID,
|
||||
CoresClockMHz: device.Clocks.Cores,
|
||||
MemoryClockMHz: device.Clocks.Memory,
|
||||
DisplayState: deviceMode.DisplayInfo.Mode.String(),
|
||||
PersistenceMode: deviceMode.Persistence.String(),
|
||||
}, nil
|
||||
}
|
||||
|
||||
// DeviceInfoByIndex returns DeviceInfo and DeviceStatus for index GPU in system device list
|
||||
func (n *nvmlDriver) DeviceInfoAndStatusByIndex(index uint) (*DeviceInfo, *DeviceStatus, error) {
|
||||
device, err := nvml.NewDevice(index)
|
||||
if err != nil {
|
||||
return nil, nil, err
|
||||
}
|
||||
status, err := device.Status()
|
||||
if err != nil {
|
||||
return nil, nil, err
|
||||
}
|
||||
return &DeviceInfo{
|
||||
UUID: device.UUID,
|
||||
Name: device.Model,
|
||||
MemoryMiB: device.Memory,
|
||||
PowerW: device.Power,
|
||||
BAR1MiB: device.PCI.BAR1,
|
||||
PCIBandwidthMBPerS: device.PCI.Bandwidth,
|
||||
PCIBusID: device.PCI.BusID,
|
||||
CoresClockMHz: device.Clocks.Cores,
|
||||
MemoryClockMHz: device.Clocks.Memory,
|
||||
}, &DeviceStatus{
|
||||
TemperatureC: status.Temperature,
|
||||
GPUUtilization: status.Utilization.GPU,
|
||||
MemoryUtilization: status.Utilization.Memory,
|
||||
EncoderUtilization: status.Utilization.Encoder,
|
||||
DecoderUtilization: status.Utilization.Decoder,
|
||||
UsedMemoryMiB: status.Memory.Global.Used,
|
||||
ECCErrorsL1Cache: status.Memory.ECCErrors.L1Cache,
|
||||
ECCErrorsL2Cache: status.Memory.ECCErrors.L2Cache,
|
||||
ECCErrorsDevice: status.Memory.ECCErrors.Device,
|
||||
PowerUsageW: status.Power,
|
||||
BAR1UsedMiB: status.PCI.BAR1Used,
|
||||
}, nil
|
||||
}
|
||||
301
plugins/device/cmd/nvidia/stats.go
Normal file
301
plugins/device/cmd/nvidia/stats.go
Normal file
@@ -0,0 +1,301 @@
|
||||
package nvidia
|
||||
|
||||
import (
|
||||
"context"
|
||||
"time"
|
||||
|
||||
"github.com/hashicorp/nomad/plugins/device"
|
||||
"github.com/hashicorp/nomad/plugins/device/cmd/nvidia/nvml"
|
||||
)
|
||||
|
||||
// Names, units and descriptions of the statistics reported for each device.
// Every statistic has an Attr (the key in the stats attribute map), a Unit
// and a human-readable Desc.
const (
	// Power draw, reported as used / maximum.
	PowerUsageAttr = "Power usage"
	PowerUsageUnit = "W"
	PowerUsageDesc = "Power usage for this GPU in watts and its associated circuitry (e.g. memory) / Maximum GPU Power"

	// GPU compute utilization.
	GPUUtilizationAttr = "GPU utilization"
	GPUUtilizationUnit = "%"
	GPUUtilizationDesc = "Percent of time over the past sample period during which one or more kernels were executing on the GPU."

	// Memory bandwidth utilization.
	MemoryUtilizationAttr = "Memory utilization"
	MemoryUtilizationUnit = "%"
	MemoryUtilizationDesc = "Percentage of bandwidth used during the past sample period"

	// Video encoder utilization.
	EncoderUtilizationAttr = "Encoder utilization"
	EncoderUtilizationUnit = "%"
	EncoderUtilizationDesc = "Percent of time over the past sample period during which GPU Encoder was used"

	// Video decoder utilization.
	DecoderUtilizationAttr = "Decoder utilization"
	DecoderUtilizationUnit = "%"
	DecoderUtilizationDesc = "Percent of time over the past sample period during which GPU Decoder was used"

	// Temperature, in degrees Celsius.
	TemperatureAttr = "Temperature"
	TemperatureUnit = "C"
	TemperatureDesc = "Temperature of the Unit"

	// Device memory, in mebibytes, reported as used / total.
	MemoryStateAttr = "Memory state"
	MemoryStateUnit = "MiB"
	MemoryStateDesc = "UsedMemory / TotalMemory"

	// BAR1 buffer, in mebibytes, reported as used / total.
	BAR1StateAttr = "BAR1 buffer state"
	BAR1StateUnit = "MiB"
	BAR1StateDesc = "UsedBAR1 / TotalBAR1"

	// ECC error counters; the unit "#" stands for a number of errors.
	ECCErrorsL1CacheAttr = "ECC L1 errors"
	ECCErrorsL1CacheUnit = "#"
	ECCErrorsL1CacheDesc = "Requested L1Cache error counter for the device"
	ECCErrorsL2CacheAttr = "ECC L2 errors"
	ECCErrorsL2CacheUnit = "#"
	ECCErrorsL2CacheDesc = "Requested L2Cache error counter for the device"
	ECCErrorsDeviceAttr  = "ECC memory errors"
	ECCErrorsDeviceUnit  = "#"
	ECCErrorsDeviceDesc  = "Requested memory error counter for the device"
)
|
||||
|
||||
// stats is the long running goroutine that streams device statistics
|
||||
func (d *NvidiaDevice) stats(ctx context.Context, stats chan<- *device.StatsResponse) {
|
||||
defer close(stats)
|
||||
|
||||
if d.nvmlClientInitializationError != nil {
|
||||
d.logger.Error("exiting stats due to problems with NVML loading", "error", d.nvmlClientInitializationError)
|
||||
return
|
||||
}
|
||||
|
||||
// Create a timer that will fire immediately for the first detection
|
||||
ticker := time.NewTimer(0)
|
||||
|
||||
for {
|
||||
select {
|
||||
case <-ctx.Done():
|
||||
return
|
||||
case <-ticker.C:
|
||||
ticker.Reset(d.statsPeriod)
|
||||
}
|
||||
|
||||
d.writeStatsToChannel(stats, time.Now())
|
||||
}
|
||||
}
|
||||
|
||||
// filterStatsByID accepts list of StatsData and set of IDs
|
||||
// this function would return entries from StatsData with IDs found in the set
|
||||
func filterStatsByID(stats []*nvml.StatsData, IDs map[string]struct{}) []*nvml.StatsData {
|
||||
var filteredStats []*nvml.StatsData
|
||||
for _, statsItem := range stats {
|
||||
if _, ok := IDs[statsItem.UUID]; ok {
|
||||
filteredStats = append(filteredStats, statsItem)
|
||||
}
|
||||
}
|
||||
return filteredStats
|
||||
}
|
||||
|
||||
// writeStatsToChannel collects StatsData from NVML backend, groups StatsData
|
||||
// by DeviceName attribute, populates DeviceGroupStats structure for every group
|
||||
// and sends data over provided channel
|
||||
func (d *NvidiaDevice) writeStatsToChannel(stats chan<- *device.StatsResponse, timestamp time.Time) {
|
||||
statsData, err := d.nvmlClient.GetStatsData()
|
||||
if err != nil {
|
||||
d.logger.Error("failed to get nvidia stats", "error", err)
|
||||
stats <- &device.StatsResponse{
|
||||
Error: err,
|
||||
}
|
||||
return
|
||||
}
|
||||
|
||||
// filter only stats from devices that are stored in NvidiaDevice struct
|
||||
d.deviceLock.RLock()
|
||||
statsData = filterStatsByID(statsData, d.devices)
|
||||
d.deviceLock.RUnlock()
|
||||
|
||||
// group stats by DeviceName struct field
|
||||
statsListByDeviceName := make(map[string][]*nvml.StatsData)
|
||||
for _, statsItem := range statsData {
|
||||
deviceName := statsItem.DeviceName
|
||||
if deviceName == nil {
|
||||
// nvml driver was not able to detect device name. This kind
|
||||
// of devices are placed to single group with 'notAvailable' name
|
||||
notAvailableCopy := notAvailable
|
||||
deviceName = ¬AvailableCopy
|
||||
}
|
||||
|
||||
statsListByDeviceName[*deviceName] = append(statsListByDeviceName[*deviceName], statsItem)
|
||||
}
|
||||
|
||||
// place data device.DeviceGroupStats struct for every group of stats
|
||||
deviceGroupsStats := make([]*device.DeviceGroupStats, 0, len(statsListByDeviceName))
|
||||
for groupName, groupStats := range statsListByDeviceName {
|
||||
deviceGroupsStats = append(deviceGroupsStats, statsForGroup(groupName, groupStats, timestamp))
|
||||
}
|
||||
|
||||
stats <- &device.StatsResponse{
|
||||
Groups: deviceGroupsStats,
|
||||
}
|
||||
}
|
||||
|
||||
func newNotAvailableDeviceStats(unit, desc string) *device.StatValue {
|
||||
return &device.StatValue{Unit: unit, Desc: desc, StringVal: notAvailable}
|
||||
}
|
||||
|
||||
// statsForGroup is a helper function that populates device.DeviceGroupStats
|
||||
// for given groupName with groupStats list
|
||||
func statsForGroup(groupName string, groupStats []*nvml.StatsData, timestamp time.Time) *device.DeviceGroupStats {
|
||||
instanceStats := make(map[string]*device.DeviceStats)
|
||||
for _, statsItem := range groupStats {
|
||||
instanceStats[statsItem.UUID] = statsForItem(statsItem, timestamp)
|
||||
}
|
||||
|
||||
return &device.DeviceGroupStats{
|
||||
Vendor: vendor,
|
||||
Type: deviceType,
|
||||
Name: groupName,
|
||||
InstanceStats: instanceStats,
|
||||
}
|
||||
}
|
||||
|
||||
// statsForItem is a helper function that populates device.DeviceStats for given
|
||||
// nvml.StatsData
|
||||
func statsForItem(statsItem *nvml.StatsData, timestamp time.Time) *device.DeviceStats {
|
||||
// nvml.StatsData holds pointers to values that can be nil
|
||||
// In case they are nil return stats with 'notAvailable' constant
|
||||
var (
|
||||
powerUsageStat *device.StatValue
|
||||
GPUUtilizationStat *device.StatValue
|
||||
memoryUtilizationStat *device.StatValue
|
||||
encoderUtilizationStat *device.StatValue
|
||||
decoderUtilizationStat *device.StatValue
|
||||
temperatureStat *device.StatValue
|
||||
memoryStateStat *device.StatValue
|
||||
BAR1StateStat *device.StatValue
|
||||
ECCErrorsL1CacheStat *device.StatValue
|
||||
ECCErrorsL2CacheStat *device.StatValue
|
||||
ECCErrorsDeviceStat *device.StatValue
|
||||
)
|
||||
|
||||
if statsItem.PowerUsageW == nil || statsItem.PowerW == nil {
|
||||
powerUsageStat = newNotAvailableDeviceStats(PowerUsageUnit, PowerUsageDesc)
|
||||
} else {
|
||||
powerUsageStat = &device.StatValue{
|
||||
Unit: PowerUsageUnit,
|
||||
Desc: PowerUsageDesc,
|
||||
IntNumeratorVal: int64(*statsItem.PowerUsageW),
|
||||
IntDenominatorVal: int64(*statsItem.PowerW),
|
||||
}
|
||||
}
|
||||
|
||||
if statsItem.GPUUtilization == nil {
|
||||
GPUUtilizationStat = newNotAvailableDeviceStats(GPUUtilizationUnit, GPUUtilizationDesc)
|
||||
} else {
|
||||
GPUUtilizationStat = &device.StatValue{
|
||||
Unit: GPUUtilizationUnit,
|
||||
Desc: GPUUtilizationDesc,
|
||||
IntNumeratorVal: int64(*statsItem.GPUUtilization),
|
||||
}
|
||||
}
|
||||
|
||||
if statsItem.MemoryUtilization == nil {
|
||||
memoryUtilizationStat = newNotAvailableDeviceStats(MemoryUtilizationUnit, MemoryUtilizationDesc)
|
||||
} else {
|
||||
memoryUtilizationStat = &device.StatValue{
|
||||
Unit: MemoryUtilizationUnit,
|
||||
Desc: MemoryUtilizationDesc,
|
||||
IntNumeratorVal: int64(*statsItem.MemoryUtilization),
|
||||
}
|
||||
}
|
||||
|
||||
if statsItem.EncoderUtilization == nil {
|
||||
encoderUtilizationStat = newNotAvailableDeviceStats(EncoderUtilizationUnit, EncoderUtilizationDesc)
|
||||
} else {
|
||||
encoderUtilizationStat = &device.StatValue{
|
||||
Unit: EncoderUtilizationUnit,
|
||||
Desc: EncoderUtilizationDesc,
|
||||
IntNumeratorVal: int64(*statsItem.EncoderUtilization),
|
||||
}
|
||||
}
|
||||
|
||||
if statsItem.DecoderUtilization == nil {
|
||||
decoderUtilizationStat = newNotAvailableDeviceStats(DecoderUtilizationUnit, DecoderUtilizationDesc)
|
||||
} else {
|
||||
decoderUtilizationStat = &device.StatValue{
|
||||
Unit: DecoderUtilizationUnit,
|
||||
Desc: DecoderUtilizationDesc,
|
||||
IntNumeratorVal: int64(*statsItem.DecoderUtilization),
|
||||
}
|
||||
}
|
||||
|
||||
if statsItem.TemperatureC == nil {
|
||||
temperatureStat = newNotAvailableDeviceStats(TemperatureUnit, TemperatureDesc)
|
||||
} else {
|
||||
temperatureStat = &device.StatValue{
|
||||
Unit: TemperatureUnit,
|
||||
Desc: TemperatureDesc,
|
||||
IntNumeratorVal: int64(*statsItem.TemperatureC),
|
||||
}
|
||||
}
|
||||
|
||||
if statsItem.UsedMemoryMiB == nil || statsItem.MemoryMiB == nil {
|
||||
memoryStateStat = newNotAvailableDeviceStats(MemoryStateUnit, MemoryStateDesc)
|
||||
} else {
|
||||
memoryStateStat = &device.StatValue{
|
||||
Unit: MemoryStateUnit,
|
||||
Desc: MemoryStateDesc,
|
||||
IntNumeratorVal: int64(*statsItem.UsedMemoryMiB),
|
||||
IntDenominatorVal: int64(*statsItem.MemoryMiB),
|
||||
}
|
||||
}
|
||||
|
||||
if statsItem.BAR1UsedMiB == nil || statsItem.BAR1MiB == nil {
|
||||
BAR1StateStat = newNotAvailableDeviceStats(BAR1StateUnit, BAR1StateDesc)
|
||||
} else {
|
||||
BAR1StateStat = &device.StatValue{
|
||||
Unit: BAR1StateUnit,
|
||||
Desc: BAR1StateDesc,
|
||||
IntNumeratorVal: int64(*statsItem.BAR1UsedMiB),
|
||||
IntDenominatorVal: int64(*statsItem.BAR1MiB),
|
||||
}
|
||||
}
|
||||
|
||||
if statsItem.ECCErrorsL1Cache == nil {
|
||||
ECCErrorsL1CacheStat = newNotAvailableDeviceStats(ECCErrorsL1CacheUnit, ECCErrorsL1CacheDesc)
|
||||
} else {
|
||||
ECCErrorsL1CacheStat = &device.StatValue{
|
||||
Unit: ECCErrorsL1CacheUnit,
|
||||
Desc: ECCErrorsL1CacheDesc,
|
||||
IntNumeratorVal: int64(*statsItem.ECCErrorsL1Cache),
|
||||
}
|
||||
}
|
||||
|
||||
if statsItem.ECCErrorsL2Cache == nil {
|
||||
ECCErrorsL2CacheStat = newNotAvailableDeviceStats(ECCErrorsL2CacheUnit, ECCErrorsL2CacheDesc)
|
||||
} else {
|
||||
ECCErrorsL2CacheStat = &device.StatValue{
|
||||
Unit: ECCErrorsL2CacheUnit,
|
||||
Desc: ECCErrorsL2CacheDesc,
|
||||
IntNumeratorVal: int64(*statsItem.ECCErrorsL2Cache),
|
||||
}
|
||||
}
|
||||
|
||||
if statsItem.ECCErrorsDevice == nil {
|
||||
ECCErrorsDeviceStat = newNotAvailableDeviceStats(ECCErrorsDeviceUnit, ECCErrorsDeviceDesc)
|
||||
} else {
|
||||
ECCErrorsDeviceStat = &device.StatValue{
|
||||
Unit: ECCErrorsDeviceUnit,
|
||||
Desc: ECCErrorsDeviceDesc,
|
||||
IntNumeratorVal: int64(*statsItem.ECCErrorsDevice),
|
||||
}
|
||||
}
|
||||
return &device.DeviceStats{
|
||||
Summary: temperatureStat,
|
||||
Stats: &device.StatObject{
|
||||
Attributes: map[string]*device.StatValue{
|
||||
PowerUsageAttr: powerUsageStat,
|
||||
GPUUtilizationAttr: GPUUtilizationStat,
|
||||
MemoryUtilizationAttr: memoryUtilizationStat,
|
||||
EncoderUtilizationAttr: encoderUtilizationStat,
|
||||
DecoderUtilizationAttr: decoderUtilizationStat,
|
||||
TemperatureAttr: temperatureStat,
|
||||
MemoryStateAttr: memoryStateStat,
|
||||
BAR1StateAttr: BAR1StateStat,
|
||||
ECCErrorsL1CacheAttr: ECCErrorsL1CacheStat,
|
||||
ECCErrorsL2CacheAttr: ECCErrorsL2CacheStat,
|
||||
ECCErrorsDeviceAttr: ECCErrorsDeviceStat,
|
||||
},
|
||||
},
|
||||
Timestamp: timestamp,
|
||||
}
|
||||
}
|
||||
3016
plugins/device/cmd/nvidia/stats_test.go
Normal file
3016
plugins/device/cmd/nvidia/stats_test.go
Normal file
File diff suppressed because it is too large
Load Diff
160
vendor/github.com/NVIDIA/gpu-monitoring-tools/CLA
generated
vendored
Normal file
160
vendor/github.com/NVIDIA/gpu-monitoring-tools/CLA
generated
vendored
Normal file
@@ -0,0 +1,160 @@
|
||||
GPU Monitoring Tools
|
||||
Software Grant and Corporate Contributor License Agreement ("Agreement")
|
||||
|
||||
Thank you for your interest in the gpu-monitoring-tools Project (the
|
||||
"Project"). In order to clarify the intellectual property license
|
||||
granted with Contributions from any person or entity, NVIDIA
|
||||
Corporation (the “Copyright Holders") must have a Contributor License
|
||||
Agreement (CLA) on file that has been signed by each Contributor,
|
||||
indicating agreement to the license terms below. This license is
|
||||
for your protection as a Contributor as well as the protection of the
|
||||
Project and its users; it does not change your rights to use your own
|
||||
Contributions for any other purpose.
|
||||
|
||||
This version of the Agreement allows an entity (the "Corporation") to
|
||||
submit Contributions to the Project, to authorize Contributions
|
||||
submitted by its designated employees to the Project, and to grant
|
||||
copyright and patent licenses thereto to the Copyright Holders.
|
||||
|
||||
If you have not already done so, please complete and sign, then scan and
|
||||
email a pdf file of this Agreement to digits@nvidia.com.
|
||||
Please read this document carefully before signing and keep a copy for
|
||||
your records.
|
||||
|
||||
Corporation name: ________________________________________________
|
||||
|
||||
Corporation address: ________________________________________________
|
||||
|
||||
________________________________________________
|
||||
|
||||
________________________________________________
|
||||
|
||||
Point of Contact: ________________________________________________
|
||||
|
||||
E-Mail: ________________________________________________
|
||||
|
||||
Telephone: _____________________ Fax: _____________________
|
||||
|
||||
|
||||
You accept and agree to the following terms and conditions for Your
|
||||
present and future Contributions submitted to the Project. In
|
||||
return, the Copyright Holders shall not use Your Contributions in a way
|
||||
that is contrary to the public benefit or inconsistent with its nonprofit
|
||||
status and bylaws in effect at the time of the Contribution. Except
|
||||
for the license granted herein to the Copyright Holders and recipients of
|
||||
software distributed by the Copyright Holders, You reserve all right, title,
|
||||
and interest in and to Your Contributions.
|
||||
|
||||
1. Definitions.
|
||||
|
||||
"You" (or "Your") shall mean the copyright owner or legal entity
|
||||
authorized by the copyright owner that is making this Agreement
|
||||
with the Copyright Holders. For legal entities, the entity making a
|
||||
Contribution and all other entities that control, are controlled by,
|
||||
or are under common control with that entity are considered to be a
|
||||
single Contributor. For the purposes of this definition, "control"
|
||||
means (i) the power, direct or indirect, to cause the direction or
|
||||
management of such entity, whether by contract or otherwise, or
|
||||
(ii) ownership of fifty percent (50%) or more of the outstanding
|
||||
shares, or (iii) beneficial ownership of such entity.
|
||||
|
||||
"Contribution" shall mean the code, documentation or other original
|
||||
works of authorship expressly identified in Schedule B, as well as
|
||||
any original work of authorship, including
|
||||
any modifications or additions to an existing work, that is intentionally
|
||||
submitted by You to the Copyright Holders for inclusion in, or
|
||||
documentation of, any of the products owned or managed by the
|
||||
Copyright Holders (the "Work"). For the purposes of this definition,
|
||||
"submitted" means any form of electronic, verbal, or written
|
||||
communication sent to the Copyright Holders or its representatives,
|
||||
including but not limited to communication on electronic mailing
|
||||
lists, source code control systems, and issue tracking systems
|
||||
that are managed by, or on behalf of, the Copyright Holders for the
|
||||
purpose of discussing and improving the Work, but excluding
|
||||
communication that is conspicuously marked or otherwise designated
|
||||
in writing by You as "Not a Contribution."
|
||||
|
||||
2. Grant of Copyright License. Subject to the terms and conditions
|
||||
of this Agreement, You hereby grant to the Copyright Holders and to
|
||||
recipients of software distributed by the Copyright Holders a
|
||||
perpetual, worldwide, non-exclusive, no-charge, royalty-free,
|
||||
irrevocable copyright license to reproduce, prepare derivative works
|
||||
of, publicly display, publicly perform, sublicense, and distribute
|
||||
Your Contributions and such derivative works.
|
||||
|
||||
3. Grant of Patent License. Subject to the terms and conditions of
|
||||
this Agreement, You hereby grant to the Copyright Holders and to
|
||||
recipients of software distributed by the Copyright Holders
|
||||
a perpetual, worldwide, non-exclusive, no-charge, royalty-free,
|
||||
irrevocable (except as stated in this section) patent license
|
||||
to make, have made, use, offer to sell, sell, import, and otherwise
|
||||
transfer the Work, where such license applies only to those
|
||||
patent claims licensable by You that are necessarily infringed
|
||||
by Your Contribution(s) alone or by combination of Your Contribution(s)
|
||||
with the Work to which such Contribution(s) were submitted.
|
||||
If any entity institutes patent litigation against You or any
|
||||
other entity (including a cross-claim or counterclaim in a lawsuit)
|
||||
alleging that your Contribution, or the Work to which you have
|
||||
contributed, constitutes direct or contributory patent infringement,
|
||||
then any patent licenses granted to that entity under this Agreement
|
||||
for that Contribution or Work shall terminate as of the date such
|
||||
litigation is filed.
|
||||
|
||||
4. You represent that You are legally entitled to grant the above
|
||||
license. You represent further that each employee of the
|
||||
Corporation designated on Schedule A below (or in a subsequent
|
||||
written modification to that Schedule) is authorized to submit
|
||||
Contributions on behalf of the Corporation.
|
||||
|
||||
5. You represent that each of Your Contributions is Your original
|
||||
creation (see section 7 for submissions on behalf of others).
|
||||
|
||||
6. You are not expected to provide support for Your Contributions,
|
||||
except to the extent You desire to provide support. You may provide
|
||||
support for free, for a fee, or not at all. Unless required by
|
||||
applicable law or agreed to in writing, You provide Your
|
||||
Contributions on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS
|
||||
OF ANY KIND, either express or implied, including, without
|
||||
limitation, any warranties or conditions of TITLE, NON-INFRINGEMENT,
|
||||
MERCHANTABILITY, or FITNESS FOR A PARTICULAR PURPOSE.
|
||||
|
||||
7. Should You wish to submit work that is not Your original creation,
|
||||
You may submit it to the Copyright Holders separately from any
|
||||
Contribution, identifying the complete details of its source and
|
||||
of any license or other restriction (including, but not limited
|
||||
to, related patents, trademarks, and license agreements) of which
|
||||
you are personally aware, and conspicuously marking the work as
|
||||
"Submitted on behalf of a third-party: [named here]".
|
||||
|
||||
8. It is your responsibility to notify the Copyright Holders when any change
|
||||
is required to the list of designated employees authorized to submit
|
||||
Contributions on behalf of the Corporation, or to the Corporation's
|
||||
Point of Contact with the Copyright Holders.
|
||||
|
||||
|
||||
|
||||
Please sign: __________________________________ Date: _______________
|
||||
|
||||
Title: __________________________________
|
||||
|
||||
Corporation: __________________________________
|
||||
|
||||
|
||||
|
||||
|
||||
Schedule A
|
||||
|
||||
[Initial list of designated employees. NB: authorization is not
|
||||
tied to particular Contributions.]
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
Schedule B
|
||||
|
||||
[Identification of optional concurrent software grant. Would be
|
||||
left blank or omitted if there is no concurrent software grant.]
|
||||
|
||||
|
||||
29
vendor/github.com/NVIDIA/gpu-monitoring-tools/LICENSE
generated
vendored
Normal file
29
vendor/github.com/NVIDIA/gpu-monitoring-tools/LICENSE
generated
vendored
Normal file
@@ -0,0 +1,29 @@
|
||||
BSD 3-Clause License
|
||||
|
||||
Copyright (c) 2018, NVIDIA Corporation
|
||||
All rights reserved.
|
||||
|
||||
Redistribution and use in source and binary forms, with or without
|
||||
modification, are permitted provided that the following conditions are met:
|
||||
|
||||
* Redistributions of source code must retain the above copyright notice, this
|
||||
list of conditions and the following disclaimer.
|
||||
|
||||
* Redistributions in binary form must reproduce the above copyright notice,
|
||||
this list of conditions and the following disclaimer in the documentation
|
||||
and/or other materials provided with the distribution.
|
||||
|
||||
* Neither the name of the copyright holder nor the names of its
|
||||
contributors may be used to endorse or promote products derived from
|
||||
this software without specific prior written permission.
|
||||
|
||||
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
|
||||
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
|
||||
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
|
||||
DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
|
||||
FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
|
||||
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
|
||||
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
|
||||
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
|
||||
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
|
||||
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
34
vendor/github.com/NVIDIA/gpu-monitoring-tools/README.md
generated
vendored
Normal file
34
vendor/github.com/NVIDIA/gpu-monitoring-tools/README.md
generated
vendored
Normal file
@@ -0,0 +1,34 @@
|
||||
# NVIDIA GPU Monitoring Tools
|
||||
|
||||
## NVML Go Bindings
|
||||
|
||||
[NVIDIA Management Library (NVML)](https://developer.nvidia.com/nvidia-management-library-nvml) is a C-based API for monitoring and managing NVIDIA GPU devices.
|
||||
NVML go bindings are taken from [nvidia-docker 1.0](https://github.com/NVIDIA/nvidia-docker/tree/1.0) with some improvements and additions. NVML headers are also added to the package to make it easy to use and build.
|
||||
|
||||
### NVML Samples
|
||||
Three [samples](https://github.com/NVIDIA/gpu-monitoring-tools/blob/master/bindings/go/samples/nvml/README.md) are included to demonstrate how to use the NVML API.
|
||||
|
||||
|
||||
## DCGM Go Bindings
|
||||
|
||||
[NVIDIA Data Center GPU Manager (DCGM)](https://developer.nvidia.com/data-center-gpu-manager-dcgm) is a set of tools for managing and monitoring NVIDIA GPUs in cluster environments. It's a low overhead tool suite that performs a variety of functions on each host system including active health monitoring, diagnostics, system validation, policies, power and clock management, group configuration and accounting.
|
||||
|
||||
DCGM go bindings make administering and monitoring containerized GPU applications easy.
|
||||
|
||||
### DCGM Samples
|
||||
|
||||
DCGM can be run in different modes, seven [samples](https://github.com/NVIDIA/gpu-monitoring-tools/blob/master/bindings/go/samples/dcgm/README.md) and a [REST API](https://github.com/NVIDIA/gpu-monitoring-tools/blob/master/bindings/go/samples/dcgm/restApi/README.md) are included for showing how to use the DCGM API and run it in different modes.
|
||||
|
||||
|
||||
## DCGM exporter
|
||||
|
||||
GPU metrics exporter for [Prometheus](https://prometheus.io/) leveraging [NVIDIA Data Center GPU Manager (DCGM)](https://developer.nvidia.com/data-center-gpu-manager-dcgm) is a simple shell script that starts nv-hostengine, reads GPU metrics every 1 second and converts them to a standard Prometheus format.
|
||||
|
||||
Find the installation and run instructions [here](https://github.com/NVIDIA/gpu-monitoring-tools/blob/master/exporters/prometheus-dcgm/README.md).
|
||||
|
||||
## Issues and Contributing
|
||||
|
||||
A signed copy of the [Contributor License Agreement](https://github.com/NVIDIA/gpu-monitoring-tools/blob/master/CLA) needs to be provided to <a href="mailto:digits@nvidia.com">digits@nvidia.com</a> before any change can be accepted.
|
||||
|
||||
* Please let us know by [filing a new issue](https://github.com/NVIDIA/gpu-monitoring-tools/issues/new)
|
||||
* You can contribute by opening a [pull request](https://help.github.com/articles/using-pull-requests/)
|
||||
634
vendor/github.com/NVIDIA/gpu-monitoring-tools/bindings/go/nvml/bindings.go
generated
vendored
Normal file
634
vendor/github.com/NVIDIA/gpu-monitoring-tools/bindings/go/nvml/bindings.go
generated
vendored
Normal file
@@ -0,0 +1,634 @@
|
||||
// Copyright (c) 2015-2018, NVIDIA CORPORATION. All rights reserved.
|
||||
|
||||
package nvml
|
||||
|
||||
// #cgo LDFLAGS: -ldl -Wl,--unresolved-symbols=ignore-in-object-files
|
||||
// #include "nvml_dl.h"
|
||||
import "C"
|
||||
|
||||
import (
|
||||
"errors"
|
||||
"fmt"
|
||||
"io/ioutil"
|
||||
"os"
|
||||
"sort"
|
||||
"strconv"
|
||||
"strings"
|
||||
)
|
||||
|
||||
const (
|
||||
szDriver = C.NVML_SYSTEM_DRIVER_VERSION_BUFFER_SIZE
|
||||
szName = C.NVML_DEVICE_NAME_BUFFER_SIZE
|
||||
szUUID = C.NVML_DEVICE_UUID_BUFFER_SIZE
|
||||
szProcs = 32
|
||||
szProcName = 64
|
||||
|
||||
XidCriticalError = C.nvmlEventTypeXidCriticalError
|
||||
)
|
||||
|
||||
type handle struct{ dev C.nvmlDevice_t }
|
||||
type EventSet struct{ set C.nvmlEventSet_t }
|
||||
type Event struct {
|
||||
UUID *string
|
||||
Etype uint64
|
||||
Edata uint64
|
||||
}
|
||||
|
||||
func uintPtr(c C.uint) *uint {
|
||||
i := uint(c)
|
||||
return &i
|
||||
}
|
||||
|
||||
func uint64Ptr(c C.ulonglong) *uint64 {
|
||||
i := uint64(c)
|
||||
return &i
|
||||
}
|
||||
|
||||
func stringPtr(c *C.char) *string {
|
||||
s := C.GoString(c)
|
||||
return &s
|
||||
}
|
||||
|
||||
func errorString(ret C.nvmlReturn_t) error {
|
||||
if ret == C.NVML_SUCCESS {
|
||||
return nil
|
||||
}
|
||||
err := C.GoString(C.nvmlErrorString(ret))
|
||||
return fmt.Errorf("nvml: %v", err)
|
||||
}
|
||||
|
||||
func init_() error {
|
||||
r := C.nvmlInit_dl()
|
||||
if r == C.NVML_ERROR_LIBRARY_NOT_FOUND {
|
||||
return errors.New("could not load NVML library")
|
||||
}
|
||||
return errorString(r)
|
||||
}
|
||||
|
||||
func NewEventSet() EventSet {
|
||||
var set C.nvmlEventSet_t
|
||||
C.nvmlEventSetCreate(&set)
|
||||
|
||||
return EventSet{set}
|
||||
}
|
||||
|
||||
func RegisterEvent(es EventSet, event int) error {
|
||||
n, err := deviceGetCount()
|
||||
if err != nil {
|
||||
return err
|
||||
}
|
||||
|
||||
var i uint
|
||||
for i = 0; i < n; i++ {
|
||||
h, err := deviceGetHandleByIndex(i)
|
||||
if err != nil {
|
||||
return err
|
||||
}
|
||||
|
||||
r := C.nvmlDeviceRegisterEvents(h.dev, C.ulonglong(event), es.set)
|
||||
if r != C.NVML_SUCCESS {
|
||||
return errorString(r)
|
||||
}
|
||||
}
|
||||
|
||||
return nil
|
||||
}
|
||||
|
||||
func RegisterEventForDevice(es EventSet, event int, uuid string) error {
|
||||
n, err := deviceGetCount()
|
||||
if err != nil {
|
||||
return err
|
||||
}
|
||||
|
||||
var i uint
|
||||
for i = 0; i < n; i++ {
|
||||
h, err := deviceGetHandleByIndex(i)
|
||||
if err != nil {
|
||||
return err
|
||||
}
|
||||
|
||||
duuid, err := h.deviceGetUUID()
|
||||
if err != nil {
|
||||
return err
|
||||
}
|
||||
|
||||
if *duuid != uuid {
|
||||
continue
|
||||
}
|
||||
|
||||
r := C.nvmlDeviceRegisterEvents(h.dev, C.ulonglong(event), es.set)
|
||||
if r != C.NVML_SUCCESS {
|
||||
return errorString(r)
|
||||
}
|
||||
|
||||
return nil
|
||||
}
|
||||
|
||||
return fmt.Errorf("nvml: device not found")
|
||||
}
|
||||
|
||||
func DeleteEventSet(es EventSet) {
|
||||
C.nvmlEventSetFree(es.set)
|
||||
}
|
||||
|
||||
func WaitForEvent(es EventSet, timeout uint) (Event, error) {
|
||||
var data C.nvmlEventData_t
|
||||
|
||||
r := C.nvmlEventSetWait(es.set, &data, C.uint(timeout))
|
||||
uuid, _ := handle{data.device}.deviceGetUUID()
|
||||
|
||||
return Event{
|
||||
UUID: uuid,
|
||||
Etype: uint64(data.eventType),
|
||||
Edata: uint64(data.eventData),
|
||||
},
|
||||
errorString(r)
|
||||
}
|
||||
|
||||
func shutdown() error {
|
||||
return errorString(C.nvmlShutdown_dl())
|
||||
}
|
||||
|
||||
func systemGetDriverVersion() (string, error) {
|
||||
var driver [szDriver]C.char
|
||||
|
||||
r := C.nvmlSystemGetDriverVersion(&driver[0], szDriver)
|
||||
return C.GoString(&driver[0]), errorString(r)
|
||||
}
|
||||
|
||||
func systemGetProcessName(pid uint) (string, error) {
|
||||
var proc [szProcName]C.char
|
||||
|
||||
r := C.nvmlSystemGetProcessName(C.uint(pid), &proc[0], szProcName)
|
||||
return C.GoString(&proc[0]), errorString(r)
|
||||
}
|
||||
|
||||
func deviceGetCount() (uint, error) {
|
||||
var n C.uint
|
||||
|
||||
r := C.nvmlDeviceGetCount(&n)
|
||||
return uint(n), errorString(r)
|
||||
}
|
||||
|
||||
func deviceGetHandleByIndex(idx uint) (handle, error) {
|
||||
var dev C.nvmlDevice_t
|
||||
|
||||
r := C.nvmlDeviceGetHandleByIndex(C.uint(idx), &dev)
|
||||
return handle{dev}, errorString(r)
|
||||
}
|
||||
|
||||
func deviceGetTopologyCommonAncestor(h1, h2 handle) (*uint, error) {
|
||||
var level C.nvmlGpuTopologyLevel_t
|
||||
|
||||
r := C.nvmlDeviceGetTopologyCommonAncestor_dl(h1.dev, h2.dev, &level)
|
||||
if r == C.NVML_ERROR_FUNCTION_NOT_FOUND || r == C.NVML_ERROR_NOT_SUPPORTED {
|
||||
return nil, nil
|
||||
}
|
||||
return uintPtr(C.uint(level)), errorString(r)
|
||||
}
|
||||
|
||||
func (h handle) deviceGetName() (*string, error) {
|
||||
var name [szName]C.char
|
||||
|
||||
r := C.nvmlDeviceGetName(h.dev, &name[0], szName)
|
||||
if r == C.NVML_ERROR_NOT_SUPPORTED {
|
||||
return nil, nil
|
||||
}
|
||||
return stringPtr(&name[0]), errorString(r)
|
||||
}
|
||||
|
||||
func (h handle) deviceGetUUID() (*string, error) {
|
||||
var uuid [szUUID]C.char
|
||||
|
||||
r := C.nvmlDeviceGetUUID(h.dev, &uuid[0], szUUID)
|
||||
if r == C.NVML_ERROR_NOT_SUPPORTED {
|
||||
return nil, nil
|
||||
}
|
||||
return stringPtr(&uuid[0]), errorString(r)
|
||||
}
|
||||
|
||||
func (h handle) deviceGetPciInfo() (*string, error) {
|
||||
var pci C.nvmlPciInfo_t
|
||||
|
||||
r := C.nvmlDeviceGetPciInfo(h.dev, &pci)
|
||||
if r == C.NVML_ERROR_NOT_SUPPORTED {
|
||||
return nil, nil
|
||||
}
|
||||
return stringPtr(&pci.busId[0]), errorString(r)
|
||||
}
|
||||
|
||||
func (h handle) deviceGetMinorNumber() (*uint, error) {
|
||||
var minor C.uint
|
||||
|
||||
r := C.nvmlDeviceGetMinorNumber(h.dev, &minor)
|
||||
if r == C.NVML_ERROR_NOT_SUPPORTED {
|
||||
return nil, nil
|
||||
}
|
||||
return uintPtr(minor), errorString(r)
|
||||
}
|
||||
|
||||
func (h handle) deviceGetBAR1MemoryInfo() (*uint64, *uint64, error) {
|
||||
var bar1 C.nvmlBAR1Memory_t
|
||||
|
||||
r := C.nvmlDeviceGetBAR1MemoryInfo(h.dev, &bar1)
|
||||
if r == C.NVML_ERROR_NOT_SUPPORTED {
|
||||
return nil, nil, nil
|
||||
}
|
||||
return uint64Ptr(bar1.bar1Total), uint64Ptr(bar1.bar1Used), errorString(r)
|
||||
}
|
||||
|
||||
func (h handle) deviceGetPowerManagementLimit() (*uint, error) {
|
||||
var power C.uint
|
||||
|
||||
r := C.nvmlDeviceGetPowerManagementLimit(h.dev, &power)
|
||||
if r == C.NVML_ERROR_NOT_SUPPORTED {
|
||||
return nil, nil
|
||||
}
|
||||
return uintPtr(power), errorString(r)
|
||||
}
|
||||
|
||||
func (h handle) deviceGetMaxClockInfo() (*uint, *uint, error) {
|
||||
var sm, mem C.uint
|
||||
|
||||
r := C.nvmlDeviceGetMaxClockInfo(h.dev, C.NVML_CLOCK_SM, &sm)
|
||||
if r == C.NVML_ERROR_NOT_SUPPORTED {
|
||||
return nil, nil, nil
|
||||
}
|
||||
if r == C.NVML_SUCCESS {
|
||||
r = C.nvmlDeviceGetMaxClockInfo(h.dev, C.NVML_CLOCK_MEM, &mem)
|
||||
}
|
||||
return uintPtr(sm), uintPtr(mem), errorString(r)
|
||||
}
|
||||
|
||||
func (h handle) deviceGetMaxPcieLinkGeneration() (*uint, error) {
|
||||
var link C.uint
|
||||
|
||||
r := C.nvmlDeviceGetMaxPcieLinkGeneration(h.dev, &link)
|
||||
if r == C.NVML_ERROR_NOT_SUPPORTED {
|
||||
return nil, nil
|
||||
}
|
||||
return uintPtr(link), errorString(r)
|
||||
}
|
||||
|
||||
func (h handle) deviceGetMaxPcieLinkWidth() (*uint, error) {
|
||||
var width C.uint
|
||||
|
||||
r := C.nvmlDeviceGetMaxPcieLinkWidth(h.dev, &width)
|
||||
if r == C.NVML_ERROR_NOT_SUPPORTED {
|
||||
return nil, nil
|
||||
}
|
||||
return uintPtr(width), errorString(r)
|
||||
}
|
||||
|
||||
func (h handle) deviceGetPowerUsage() (*uint, error) {
|
||||
var power C.uint
|
||||
|
||||
r := C.nvmlDeviceGetPowerUsage(h.dev, &power)
|
||||
if r == C.NVML_ERROR_NOT_SUPPORTED {
|
||||
return nil, nil
|
||||
}
|
||||
return uintPtr(power), errorString(r)
|
||||
}
|
||||
|
||||
func (h handle) deviceGetTemperature() (*uint, error) {
|
||||
var temp C.uint
|
||||
|
||||
r := C.nvmlDeviceGetTemperature(h.dev, C.NVML_TEMPERATURE_GPU, &temp)
|
||||
if r == C.NVML_ERROR_NOT_SUPPORTED {
|
||||
return nil, nil
|
||||
}
|
||||
return uintPtr(temp), errorString(r)
|
||||
}
|
||||
|
||||
func (h handle) deviceGetUtilizationRates() (*uint, *uint, error) {
|
||||
var usage C.nvmlUtilization_t
|
||||
|
||||
r := C.nvmlDeviceGetUtilizationRates(h.dev, &usage)
|
||||
if r == C.NVML_ERROR_NOT_SUPPORTED {
|
||||
return nil, nil, nil
|
||||
}
|
||||
return uintPtr(usage.gpu), uintPtr(usage.memory), errorString(r)
|
||||
}
|
||||
|
||||
func (h handle) deviceGetEncoderUtilization() (*uint, error) {
|
||||
var usage, sampling C.uint
|
||||
|
||||
r := C.nvmlDeviceGetEncoderUtilization(h.dev, &usage, &sampling)
|
||||
if r == C.NVML_ERROR_NOT_SUPPORTED {
|
||||
return nil, nil
|
||||
}
|
||||
return uintPtr(usage), errorString(r)
|
||||
}
|
||||
|
||||
func (h handle) deviceGetDecoderUtilization() (*uint, error) {
|
||||
var usage, sampling C.uint
|
||||
|
||||
r := C.nvmlDeviceGetDecoderUtilization(h.dev, &usage, &sampling)
|
||||
if r == C.NVML_ERROR_NOT_SUPPORTED {
|
||||
return nil, nil
|
||||
}
|
||||
return uintPtr(usage), errorString(r)
|
||||
}
|
||||
|
||||
func (h handle) deviceGetMemoryInfo() (totalMem *uint64, devMem DeviceMemory, err error) {
|
||||
var mem C.nvmlMemory_t
|
||||
|
||||
r := C.nvmlDeviceGetMemoryInfo(h.dev, &mem)
|
||||
if r == C.NVML_ERROR_NOT_SUPPORTED {
|
||||
return
|
||||
}
|
||||
|
||||
err = errorString(r)
|
||||
if r != C.NVML_SUCCESS {
|
||||
return
|
||||
}
|
||||
|
||||
totalMem = uint64Ptr(mem.total)
|
||||
if totalMem != nil {
|
||||
*totalMem /= 1024 * 1024 // MiB
|
||||
}
|
||||
|
||||
devMem = DeviceMemory{
|
||||
Used: uint64Ptr(mem.used),
|
||||
Free: uint64Ptr(mem.free),
|
||||
}
|
||||
|
||||
if devMem.Used != nil {
|
||||
*devMem.Used /= 1024 * 1024 // MiB
|
||||
}
|
||||
|
||||
if devMem.Free != nil {
|
||||
*devMem.Free /= 1024 * 1024 // MiB
|
||||
}
|
||||
return
|
||||
}
|
||||
|
||||
func (h handle) deviceGetClockInfo() (*uint, *uint, error) {
|
||||
var sm, mem C.uint
|
||||
|
||||
r := C.nvmlDeviceGetClockInfo(h.dev, C.NVML_CLOCK_SM, &sm)
|
||||
if r == C.NVML_ERROR_NOT_SUPPORTED {
|
||||
return nil, nil, nil
|
||||
}
|
||||
if r == C.NVML_SUCCESS {
|
||||
r = C.nvmlDeviceGetClockInfo(h.dev, C.NVML_CLOCK_MEM, &mem)
|
||||
}
|
||||
return uintPtr(sm), uintPtr(mem), errorString(r)
|
||||
}
|
||||
|
||||
func (h handle) deviceGetMemoryErrorCounter() (*uint64, *uint64, *uint64, error) {
|
||||
var l1, l2, mem C.ulonglong
|
||||
|
||||
r := C.nvmlDeviceGetMemoryErrorCounter(h.dev, C.NVML_MEMORY_ERROR_TYPE_UNCORRECTED,
|
||||
C.NVML_VOLATILE_ECC, C.NVML_MEMORY_LOCATION_L1_CACHE, &l1)
|
||||
if r == C.NVML_ERROR_NOT_SUPPORTED {
|
||||
return nil, nil, nil, nil
|
||||
}
|
||||
if r == C.NVML_SUCCESS {
|
||||
r = C.nvmlDeviceGetMemoryErrorCounter(h.dev, C.NVML_MEMORY_ERROR_TYPE_UNCORRECTED,
|
||||
C.NVML_VOLATILE_ECC, C.NVML_MEMORY_LOCATION_L2_CACHE, &l2)
|
||||
}
|
||||
if r == C.NVML_SUCCESS {
|
||||
r = C.nvmlDeviceGetMemoryErrorCounter(h.dev, C.NVML_MEMORY_ERROR_TYPE_UNCORRECTED,
|
||||
C.NVML_VOLATILE_ECC, C.NVML_MEMORY_LOCATION_DEVICE_MEMORY, &mem)
|
||||
}
|
||||
return uint64Ptr(l1), uint64Ptr(l2), uint64Ptr(mem), errorString(r)
|
||||
}
|
||||
|
||||
func (h handle) deviceGetPcieThroughput() (*uint, *uint, error) {
|
||||
var rx, tx C.uint
|
||||
|
||||
r := C.nvmlDeviceGetPcieThroughput(h.dev, C.NVML_PCIE_UTIL_RX_BYTES, &rx)
|
||||
if r == C.NVML_ERROR_NOT_SUPPORTED {
|
||||
return nil, nil, nil
|
||||
}
|
||||
if r == C.NVML_SUCCESS {
|
||||
r = C.nvmlDeviceGetPcieThroughput(h.dev, C.NVML_PCIE_UTIL_TX_BYTES, &tx)
|
||||
}
|
||||
return uintPtr(rx), uintPtr(tx), errorString(r)
|
||||
}
|
||||
|
||||
func (h handle) deviceGetComputeRunningProcesses() ([]uint, []uint64, error) {
|
||||
var procs [szProcs]C.nvmlProcessInfo_t
|
||||
var count = C.uint(szProcs)
|
||||
|
||||
r := C.nvmlDeviceGetComputeRunningProcesses(h.dev, &count, &procs[0])
|
||||
if r == C.NVML_ERROR_NOT_SUPPORTED {
|
||||
return nil, nil, nil
|
||||
}
|
||||
n := int(count)
|
||||
pids := make([]uint, n)
|
||||
mems := make([]uint64, n)
|
||||
for i := 0; i < n; i++ {
|
||||
pids[i] = uint(procs[i].pid)
|
||||
mems[i] = uint64(procs[i].usedGpuMemory)
|
||||
}
|
||||
return pids, mems, errorString(r)
|
||||
}
|
||||
|
||||
func (h handle) deviceGetGraphicsRunningProcesses() ([]uint, []uint64, error) {
|
||||
var procs [szProcs]C.nvmlProcessInfo_t
|
||||
var count = C.uint(szProcs)
|
||||
|
||||
r := C.nvmlDeviceGetGraphicsRunningProcesses(h.dev, &count, &procs[0])
|
||||
if r == C.NVML_ERROR_NOT_SUPPORTED {
|
||||
return nil, nil, nil
|
||||
}
|
||||
n := int(count)
|
||||
pids := make([]uint, n)
|
||||
mems := make([]uint64, n)
|
||||
for i := 0; i < n; i++ {
|
||||
pids[i] = uint(procs[i].pid)
|
||||
mems[i] = uint64(procs[i].usedGpuMemory)
|
||||
}
|
||||
return pids, mems, errorString(r)
|
||||
}
|
||||
|
||||
func (h handle) deviceGetAllRunningProcesses() ([]ProcessInfo, error) {
|
||||
cPids, cpMems, err := h.deviceGetComputeRunningProcesses()
|
||||
if err != nil {
|
||||
return nil, err
|
||||
}
|
||||
|
||||
gPids, gpMems, err := h.deviceGetGraphicsRunningProcesses()
|
||||
if err != nil {
|
||||
return nil, err
|
||||
}
|
||||
|
||||
allPids := make(map[uint]ProcessInfo)
|
||||
|
||||
for i, pid := range cPids {
|
||||
name, err := processName(pid)
|
||||
if err != nil {
|
||||
return nil, err
|
||||
}
|
||||
allPids[pid] = ProcessInfo{
|
||||
PID: pid,
|
||||
Name: name,
|
||||
MemoryUsed: cpMems[i] / (1024 * 1024), // MiB
|
||||
Type: Compute,
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
for i, pid := range gPids {
|
||||
pInfo, exists := allPids[pid]
|
||||
if exists {
|
||||
pInfo.Type = ComputeAndGraphics
|
||||
allPids[pid] = pInfo
|
||||
} else {
|
||||
name, err := processName(pid)
|
||||
if err != nil {
|
||||
return nil, err
|
||||
}
|
||||
allPids[pid] = ProcessInfo{
|
||||
PID: pid,
|
||||
Name: name,
|
||||
MemoryUsed: gpMems[i] / (1024 * 1024), // MiB
|
||||
Type: Graphics,
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
var processInfo []ProcessInfo
|
||||
for _, v := range allPids {
|
||||
processInfo = append(processInfo, v)
|
||||
}
|
||||
sort.Slice(processInfo, func(i, j int) bool {
|
||||
return processInfo[i].PID < processInfo[j].PID
|
||||
})
|
||||
|
||||
return processInfo, nil
|
||||
}
|
||||
|
||||
func (h handle) getClocksThrottleReasons() (reason ThrottleReason, err error) {
|
||||
var clocksThrottleReasons C.ulonglong
|
||||
|
||||
r := C.nvmlDeviceGetCurrentClocksThrottleReasons(h.dev, &clocksThrottleReasons)
|
||||
|
||||
if r == C.NVML_ERROR_NOT_SUPPORTED {
|
||||
return ThrottleReasonUnknown, nil
|
||||
}
|
||||
|
||||
if r != C.NVML_SUCCESS {
|
||||
return ThrottleReasonUnknown, errorString(r)
|
||||
}
|
||||
|
||||
switch clocksThrottleReasons {
|
||||
case C.nvmlClocksThrottleReasonGpuIdle:
|
||||
reason = ThrottleReasonGpuIdle
|
||||
case C.nvmlClocksThrottleReasonApplicationsClocksSetting:
|
||||
reason = ThrottleReasonApplicationsClocksSetting
|
||||
case C.nvmlClocksThrottleReasonSwPowerCap:
|
||||
reason = ThrottleReasonSwPowerCap
|
||||
case C.nvmlClocksThrottleReasonHwSlowdown:
|
||||
reason = ThrottleReasonHwSlowdown
|
||||
case C.nvmlClocksThrottleReasonSyncBoost:
|
||||
reason = ThrottleReasonSyncBoost
|
||||
case C.nvmlClocksThrottleReasonSwThermalSlowdown:
|
||||
reason = ThrottleReasonSwThermalSlowdown
|
||||
case C.nvmlClocksThrottleReasonHwThermalSlowdown:
|
||||
reason = ThrottleReasonHwThermalSlowdown
|
||||
case C.nvmlClocksThrottleReasonHwPowerBrakeSlowdown:
|
||||
reason = ThrottleReasonHwPowerBrakeSlowdown
|
||||
case C.nvmlClocksThrottleReasonDisplayClockSetting:
|
||||
reason = ThrottleReasonDisplayClockSetting
|
||||
case C.nvmlClocksThrottleReasonNone:
|
||||
reason = ThrottleReasonNone
|
||||
}
|
||||
return
|
||||
}
|
||||
|
||||
func (h handle) getPerformanceState() (PerfState, error) {
|
||||
var pstate C.nvmlPstates_t
|
||||
|
||||
r := C.nvmlDeviceGetPerformanceState(h.dev, &pstate)
|
||||
|
||||
if r == C.NVML_ERROR_NOT_SUPPORTED {
|
||||
return PerfStateUnknown, nil
|
||||
}
|
||||
|
||||
if r != C.NVML_SUCCESS {
|
||||
return PerfStateUnknown, errorString(r)
|
||||
}
|
||||
return PerfState(pstate), nil
|
||||
}
|
||||
|
||||
// processName returns the contents of /proc/<pid>/comm with one trailing
// newline removed.
//
// A missing file is not an error: the process may have exited between being
// listed and being looked up, so ("", nil) is returned in that case. Any
// other read failure is returned unchanged.
func processName(pid uint) (string, error) {
	commPath := "/proc/" + strconv.FormatUint(uint64(pid), 10) + "/comm"

	raw, err := ioutil.ReadFile(commPath)
	switch {
	case err == nil:
		return strings.TrimSuffix(string(raw), "\n"), nil
	case os.IsNotExist(err):
		// TOCTOU: process terminated
		return "", nil
	default:
		return "", err
	}
}
|
||||
|
||||
func (h handle) getAccountingInfo() (accountingInfo Accounting, err error) {
|
||||
var mode C.nvmlEnableState_t
|
||||
var buffer C.uint
|
||||
|
||||
r := C.nvmlDeviceGetAccountingMode(h.dev, &mode)
|
||||
if r == C.NVML_ERROR_NOT_SUPPORTED {
|
||||
return
|
||||
}
|
||||
|
||||
if r != C.NVML_SUCCESS {
|
||||
return accountingInfo, errorString(r)
|
||||
}
|
||||
|
||||
r = C.nvmlDeviceGetAccountingBufferSize(h.dev, &buffer)
|
||||
if r == C.NVML_ERROR_NOT_SUPPORTED {
|
||||
return
|
||||
}
|
||||
|
||||
if r != C.NVML_SUCCESS {
|
||||
return accountingInfo, errorString(r)
|
||||
}
|
||||
|
||||
accountingInfo = Accounting{
|
||||
Mode: ModeState(mode),
|
||||
BufferSize: uintPtr(buffer),
|
||||
}
|
||||
return
|
||||
}
|
||||
|
||||
func (h handle) getDisplayInfo() (display Display, err error) {
|
||||
var mode, isActive C.nvmlEnableState_t
|
||||
|
||||
r := C.nvmlDeviceGetDisplayActive(h.dev, &mode)
|
||||
if r == C.NVML_ERROR_NOT_SUPPORTED {
|
||||
return
|
||||
}
|
||||
|
||||
if r != C.NVML_SUCCESS {
|
||||
return display, errorString(r)
|
||||
}
|
||||
|
||||
r = C.nvmlDeviceGetDisplayMode(h.dev, &isActive)
|
||||
if r == C.NVML_ERROR_NOT_SUPPORTED {
|
||||
return
|
||||
}
|
||||
if r != C.NVML_SUCCESS {
|
||||
return display, errorString(r)
|
||||
}
|
||||
display = Display{
|
||||
Mode: ModeState(mode),
|
||||
Active: ModeState(isActive),
|
||||
}
|
||||
return
|
||||
}
|
||||
|
||||
func (h handle) getPeristenceMode() (state ModeState, err error) {
|
||||
var mode C.nvmlEnableState_t
|
||||
|
||||
r := C.nvmlDeviceGetPersistenceMode(h.dev, &mode)
|
||||
if r == C.NVML_ERROR_NOT_SUPPORTED {
|
||||
return
|
||||
}
|
||||
return ModeState(mode), errorString(r)
|
||||
}
|
||||
533
vendor/github.com/NVIDIA/gpu-monitoring-tools/bindings/go/nvml/nvml.go
generated
vendored
Normal file
533
vendor/github.com/NVIDIA/gpu-monitoring-tools/bindings/go/nvml/nvml.go
generated
vendored
Normal file
@@ -0,0 +1,533 @@
|
||||
// Copyright (c) 2015-2018, NVIDIA CORPORATION. All rights reserved.
|
||||
|
||||
package nvml
|
||||
|
||||
// #include "nvml_dl.h"
|
||||
import "C"
|
||||
|
||||
import (
|
||||
"bytes"
|
||||
"errors"
|
||||
"fmt"
|
||||
"io/ioutil"
|
||||
"strconv"
|
||||
"strings"
|
||||
)
|
||||
|
||||
// Sentinel errors returned by this package.
var (
	// ErrCPUAffinity prefixes errors produced while parsing a device's NUMA
	// node.
	ErrCPUAffinity = errors.New("failed to retrieve CPU affinity")
	// ErrUnsupportedP2PLink is returned by GetP2PLink for a topology level
	// that has no P2PLinkType equivalent.
	ErrUnsupportedP2PLink = errors.New("unsupported P2P link type")
	// ErrUnsupportedGPU is returned when a device lacks a minor number, PCI
	// bus id or UUID.
	ErrUnsupportedGPU = errors.New("unsupported GPU device")
)
|
||||
|
||||
// ModeState reports whether a device feature (display, persistence,
// accounting, ...) is turned on.
//
// The bindings convert NVML's nvmlEnableState_t to ModeState with a plain
// numeric cast, so the constants below must carry the same numeric values as
// the C enum: NVML_FEATURE_DISABLED = 0 and NVML_FEATURE_ENABLED = 1.
// Previously Enabled was 0 and Disabled was 1, which inverted every reported
// state.
type ModeState uint

const (
	// Disabled mirrors NVML_FEATURE_DISABLED (0). It is also the zero value,
	// which is what the bindings return when a query is not supported.
	Disabled ModeState = iota
	// Enabled mirrors NVML_FEATURE_ENABLED (1).
	Enabled
)

// String returns "Enabled" or "Disabled", and "N/A" for any other value.
func (m ModeState) String() string {
	switch m {
	case Enabled:
		return "Enabled"
	case Disabled:
		return "Disabled"
	}
	return "N/A"
}
|
||||
|
||||
type Display struct {
|
||||
Mode ModeState
|
||||
Active ModeState
|
||||
}
|
||||
|
||||
type Accounting struct {
|
||||
Mode ModeState
|
||||
BufferSize *uint
|
||||
}
|
||||
|
||||
type DeviceMode struct {
|
||||
DisplayInfo Display
|
||||
Persistence ModeState
|
||||
AccountingInfo Accounting
|
||||
}
|
||||
|
||||
// ThrottleReason enumerates the reasons a device's clocks may be throttled.
type ThrottleReason uint

// Known throttle reasons, numbered consecutively from zero.
const (
	ThrottleReasonGpuIdle ThrottleReason = iota
	ThrottleReasonApplicationsClocksSetting
	ThrottleReasonSwPowerCap
	ThrottleReasonHwSlowdown
	ThrottleReasonSyncBoost
	ThrottleReasonSwThermalSlowdown
	ThrottleReasonHwThermalSlowdown
	ThrottleReasonHwPowerBrakeSlowdown
	ThrottleReasonDisplayClockSetting
	ThrottleReasonNone
	ThrottleReasonUnknown
)

// throttleReasonNames maps each named reason up to ThrottleReasonNone to its
// human-readable label. ThrottleReasonUnknown intentionally has no entry.
var throttleReasonNames = [...]string{
	ThrottleReasonGpuIdle:                   "Gpu Idle",
	ThrottleReasonApplicationsClocksSetting: "Applications Clocks Setting",
	ThrottleReasonSwPowerCap:                "SW Power Cap",
	ThrottleReasonHwSlowdown:                "HW Slowdown",
	ThrottleReasonSyncBoost:                 "Sync Boost",
	ThrottleReasonSwThermalSlowdown:         "SW Thermal Slowdown",
	ThrottleReasonHwThermalSlowdown:         "HW Thermal Slowdown",
	ThrottleReasonHwPowerBrakeSlowdown:      "HW Power Brake Slowdown",
	ThrottleReasonDisplayClockSetting:       "Display Clock Setting",
	ThrottleReasonNone:                      "No clocks throttling",
}

// String returns the label of r, or "N/A" for ThrottleReasonUnknown and any
// value outside the named range.
func (r ThrottleReason) String() string {
	if r < ThrottleReason(len(throttleReasonNames)) {
		return throttleReasonNames[r]
	}
	return "N/A"
}
|
||||
|
||||
// PerfState is a device performance state number.
type PerfState uint

// Bounds of the performance-state range. The constants are deliberately left
// untyped, as in the original declaration.
const (
	// PerfStateMax is the numerically lowest state (P0).
	PerfStateMax = 0
	// PerfStateMin is the numerically highest valid state (P15).
	PerfStateMin = 15
	// PerfStateUnknown is the value used for an unknown state.
	PerfStateUnknown = 32
)

// String formats p as "P<n>" for states 0 through 15 and as "Unknown" for
// every other value.
func (p PerfState) String() string {
	// p is unsigned, so only the upper bound needs checking.
	if p > PerfStateMin {
		return "Unknown"
	}
	return fmt.Sprintf("P%d", uint(p))
}
|
||||
|
||||
// ProcessType classifies a process running on a device.
type ProcessType uint

// Known process types.
const (
	Compute ProcessType = iota
	Graphics
	ComputeAndGraphics
)

// String returns "C" for Compute, "G" for Graphics and "C+G" for every other
// value (including ComputeAndGraphics).
func (t ProcessType) String() string {
	switch t {
	case Compute:
		return "C"
	case Graphics:
		return "G"
	default:
		return "C+G"
	}
}
|
||||
|
||||
// P2PLinkType describes how two devices are connected to each other.
type P2PLinkType uint

// Known link types, numbered consecutively from zero.
const (
	P2PLinkUnknown P2PLinkType = iota
	P2PLinkCrossCPU
	P2PLinkSameCPU
	P2PLinkHostBridge
	P2PLinkMultiSwitch
	P2PLinkSingleSwitch
	P2PLinkSameBoard
)

// P2PLink pairs a PCI bus id with the type of link to that device.
type P2PLink struct {
	BusID string
	Link  P2PLinkType
}

// p2pLinkNames maps every named link type to its human-readable label.
var p2pLinkNames = [...]string{
	P2PLinkUnknown:      "N/A",
	P2PLinkCrossCPU:     "Cross CPU socket",
	P2PLinkSameCPU:      "Same CPU socket",
	P2PLinkHostBridge:   "Host PCI bridge",
	P2PLinkMultiSwitch:  "Multiple PCI switches",
	P2PLinkSingleSwitch: "Single PCI switch",
	P2PLinkSameBoard:    "Same board",
}

// String returns the label of t, or "N/A" for P2PLinkUnknown and any value
// outside the named range.
func (t P2PLinkType) String() string {
	if t < P2PLinkType(len(p2pLinkNames)) {
		return p2pLinkNames[t]
	}
	return "N/A"
}
|
||||
|
||||
// ClockInfo holds core and memory clock values. NewDevice and Status annotate
// both values as MHz. A nil field means the value was not reported.
type ClockInfo struct {
	// Cores is the core clock.
	Cores *uint
	// Memory is the memory clock.
	Memory *uint
}
|
||||
|
||||
// PCIInfo holds the static PCI properties of a device.
type PCIInfo struct {
	// BusID is the PCI bus id of the device.
	BusID string
	// BAR1 is the BAR1 memory size; NewDevice converts it to MiB. Nil when
	// not reported.
	BAR1 *uint64
	// Bandwidth is the PCIe bandwidth in MB/s as computed by pciBandwidth.
	// Nil when it could not be determined.
	Bandwidth *uint
}
|
||||
|
||||
type Device struct {
|
||||
handle
|
||||
|
||||
UUID string
|
||||
Path string
|
||||
Model *string
|
||||
Power *uint
|
||||
Memory *uint64
|
||||
CPUAffinity *uint
|
||||
PCI PCIInfo
|
||||
Clocks ClockInfo
|
||||
Topology []P2PLink
|
||||
}
|
||||
|
||||
// UtilizationInfo holds utilization figures. Status annotates all four values
// as percentages. A nil field means the value was not reported.
type UtilizationInfo struct {
	// GPU is the GPU utilization.
	GPU *uint
	// Memory is the memory utilization.
	Memory *uint
	// Encoder is the encoder utilization.
	Encoder *uint
	// Decoder is the decoder utilization.
	Decoder *uint
}
|
||||
|
||||
// PCIThroughputInfo holds PCIe receive/transmit throughput. Status converts
// both values to MB/s. A nil field means the value was not reported.
type PCIThroughputInfo struct {
	// RX is the receive throughput.
	RX *uint
	// TX is the transmit throughput.
	TX *uint
}
|
||||
|
||||
type PCIStatusInfo struct {
|
||||
BAR1Used *uint64
|
||||
Throughput PCIThroughputInfo
|
||||
}
|
||||
|
||||
// ECCErrorsInfo holds the memory error counters filled in by Status from
// deviceGetMemoryErrorCounter. A nil field means the counter was not reported.
type ECCErrorsInfo struct {
	// L1Cache is the L1 cache error counter.
	L1Cache *uint64
	// L2Cache is the L2 cache error counter.
	L2Cache *uint64
	// Device is the device memory error counter.
	Device *uint64
}
|
||||
|
||||
// DeviceMemory holds used/free device memory figures (unit set by the
// bindings — TODO confirm). A nil field means the value was not reported.
type DeviceMemory struct {
	// Used is the amount of memory in use.
	Used *uint64
	// Free is the amount of memory available.
	Free *uint64
}
|
||||
|
||||
type MemoryInfo struct {
|
||||
Global DeviceMemory
|
||||
ECCErrors ECCErrorsInfo
|
||||
}
|
||||
|
||||
type ProcessInfo struct {
|
||||
PID uint
|
||||
Name string
|
||||
MemoryUsed uint64
|
||||
Type ProcessType
|
||||
}
|
||||
|
||||
type DeviceStatus struct {
|
||||
Power *uint
|
||||
Temperature *uint
|
||||
Utilization UtilizationInfo
|
||||
Memory MemoryInfo
|
||||
Clocks ClockInfo
|
||||
PCI PCIStatusInfo
|
||||
Processes []ProcessInfo
|
||||
Throttle ThrottleReason
|
||||
Performance PerfState
|
||||
}
|
||||
|
||||
// assert panics with err when err is non-nil and does nothing otherwise.
// Callers in this file pair it with a deferred recover that turns the panic
// back into a returned error.
func assert(err error) {
	if err == nil {
		return
	}
	panic(err)
}
|
||||
|
||||
func Init() error {
|
||||
return init_()
|
||||
}
|
||||
|
||||
func Shutdown() error {
|
||||
return shutdown()
|
||||
}
|
||||
|
||||
func GetDeviceCount() (uint, error) {
|
||||
return deviceGetCount()
|
||||
}
|
||||
|
||||
func GetDriverVersion() (string, error) {
|
||||
return systemGetDriverVersion()
|
||||
}
|
||||
|
||||
func numaNode(busid string) (uint, error) {
|
||||
// discard leading zeros of busid
|
||||
b, err := ioutil.ReadFile(fmt.Sprintf("/sys/bus/pci/devices/%s/numa_node", strings.ToLower(busid[4:])))
|
||||
if err != nil {
|
||||
// XXX report node 0 if NUMA support isn't enabled
|
||||
return 0, nil
|
||||
}
|
||||
node, err := strconv.ParseInt(string(bytes.TrimSpace(b)), 10, 8)
|
||||
if err != nil {
|
||||
return 0, fmt.Errorf("%v: %v", ErrCPUAffinity, err)
|
||||
}
|
||||
if node < 0 {
|
||||
node = 0 // XXX report node 0 instead of NUMA_NO_NODE
|
||||
}
|
||||
return uint(node), nil
|
||||
}
|
||||
|
||||
// pciBandwidth returns the PCIe bandwidth in MB/s for a link of the given
// generation and lane count (per-lane rate multiplied by *width).
//
// It returns nil when either input is nil or when the generation is not one
// of the known ones. Previously an unknown generation produced a pointer to 0,
// which reads as "zero bandwidth" rather than "unknown".
func pciBandwidth(gen, width *uint) *uint {
	if gen == nil || width == nil {
		return nil
	}

	// Per-lane rate in MB/s for each PCIe generation.
	var perLane uint
	switch *gen {
	case 1:
		perLane = 250
	case 2:
		perLane = 500
	case 3:
		perLane = 985
	case 4:
		perLane = 1969
	case 5:
		perLane = 3938
	default:
		return nil
	}

	bw := perLane * *width
	return &bw
}
|
||||
|
||||
func NewDevice(idx uint) (device *Device, err error) {
|
||||
defer func() {
|
||||
if r := recover(); r != nil {
|
||||
err = r.(error)
|
||||
}
|
||||
}()
|
||||
|
||||
h, err := deviceGetHandleByIndex(idx)
|
||||
assert(err)
|
||||
model, err := h.deviceGetName()
|
||||
assert(err)
|
||||
uuid, err := h.deviceGetUUID()
|
||||
assert(err)
|
||||
minor, err := h.deviceGetMinorNumber()
|
||||
assert(err)
|
||||
power, err := h.deviceGetPowerManagementLimit()
|
||||
assert(err)
|
||||
totalMem, _, err := h.deviceGetMemoryInfo()
|
||||
assert(err)
|
||||
busid, err := h.deviceGetPciInfo()
|
||||
assert(err)
|
||||
bar1, _, err := h.deviceGetBAR1MemoryInfo()
|
||||
assert(err)
|
||||
pcig, err := h.deviceGetMaxPcieLinkGeneration()
|
||||
assert(err)
|
||||
pciw, err := h.deviceGetMaxPcieLinkWidth()
|
||||
assert(err)
|
||||
ccore, cmem, err := h.deviceGetMaxClockInfo()
|
||||
assert(err)
|
||||
|
||||
if minor == nil || busid == nil || uuid == nil {
|
||||
return nil, ErrUnsupportedGPU
|
||||
}
|
||||
path := fmt.Sprintf("/dev/nvidia%d", *minor)
|
||||
node, err := numaNode(*busid)
|
||||
assert(err)
|
||||
|
||||
device = &Device{
|
||||
handle: h,
|
||||
UUID: *uuid,
|
||||
Path: path,
|
||||
Model: model,
|
||||
Power: power,
|
||||
Memory: totalMem,
|
||||
CPUAffinity: &node,
|
||||
PCI: PCIInfo{
|
||||
BusID: *busid,
|
||||
BAR1: bar1,
|
||||
Bandwidth: pciBandwidth(pcig, pciw), // MB/s
|
||||
},
|
||||
Clocks: ClockInfo{
|
||||
Cores: ccore, // MHz
|
||||
Memory: cmem, // MHz
|
||||
},
|
||||
}
|
||||
if power != nil {
|
||||
*device.Power /= 1000 // W
|
||||
}
|
||||
if bar1 != nil {
|
||||
*device.PCI.BAR1 /= 1024 * 1024 // MiB
|
||||
}
|
||||
return
|
||||
}
|
||||
|
||||
func NewDeviceLite(idx uint) (device *Device, err error) {
|
||||
defer func() {
|
||||
if r := recover(); r != nil {
|
||||
err = r.(error)
|
||||
}
|
||||
}()
|
||||
|
||||
h, err := deviceGetHandleByIndex(idx)
|
||||
assert(err)
|
||||
uuid, err := h.deviceGetUUID()
|
||||
assert(err)
|
||||
minor, err := h.deviceGetMinorNumber()
|
||||
assert(err)
|
||||
busid, err := h.deviceGetPciInfo()
|
||||
assert(err)
|
||||
|
||||
if minor == nil || busid == nil || uuid == nil {
|
||||
return nil, ErrUnsupportedGPU
|
||||
}
|
||||
path := fmt.Sprintf("/dev/nvidia%d", *minor)
|
||||
|
||||
device = &Device{
|
||||
handle: h,
|
||||
UUID: *uuid,
|
||||
Path: path,
|
||||
PCI: PCIInfo{
|
||||
BusID: *busid,
|
||||
},
|
||||
}
|
||||
return
|
||||
}
|
||||
|
||||
func (d *Device) Status() (status *DeviceStatus, err error) {
|
||||
defer func() {
|
||||
if r := recover(); r != nil {
|
||||
err = r.(error)
|
||||
}
|
||||
}()
|
||||
|
||||
power, err := d.deviceGetPowerUsage()
|
||||
assert(err)
|
||||
temp, err := d.deviceGetTemperature()
|
||||
assert(err)
|
||||
ugpu, umem, err := d.deviceGetUtilizationRates()
|
||||
assert(err)
|
||||
uenc, err := d.deviceGetEncoderUtilization()
|
||||
assert(err)
|
||||
udec, err := d.deviceGetDecoderUtilization()
|
||||
assert(err)
|
||||
_, devMem, err := d.deviceGetMemoryInfo()
|
||||
assert(err)
|
||||
ccore, cmem, err := d.deviceGetClockInfo()
|
||||
assert(err)
|
||||
_, bar1, err := d.deviceGetBAR1MemoryInfo()
|
||||
assert(err)
|
||||
el1, el2, emem, err := d.deviceGetMemoryErrorCounter()
|
||||
assert(err)
|
||||
pcirx, pcitx, err := d.deviceGetPcieThroughput()
|
||||
assert(err)
|
||||
throttle, err := d.getClocksThrottleReasons()
|
||||
assert(err)
|
||||
perfState, err := d.getPerformanceState()
|
||||
assert(err)
|
||||
processInfo, err := d.deviceGetAllRunningProcesses()
|
||||
assert(err)
|
||||
|
||||
status = &DeviceStatus{
|
||||
Power: power,
|
||||
Temperature: temp, // °C
|
||||
Utilization: UtilizationInfo{
|
||||
GPU: ugpu, // %
|
||||
Memory: umem, // %
|
||||
Encoder: uenc, // %
|
||||
Decoder: udec, // %
|
||||
},
|
||||
Memory: MemoryInfo{
|
||||
Global: devMem,
|
||||
ECCErrors: ECCErrorsInfo{
|
||||
L1Cache: el1,
|
||||
L2Cache: el2,
|
||||
Device: emem,
|
||||
},
|
||||
},
|
||||
Clocks: ClockInfo{
|
||||
Cores: ccore, // MHz
|
||||
Memory: cmem, // MHz
|
||||
},
|
||||
PCI: PCIStatusInfo{
|
||||
BAR1Used: bar1,
|
||||
Throughput: PCIThroughputInfo{
|
||||
RX: pcirx,
|
||||
TX: pcitx,
|
||||
},
|
||||
},
|
||||
Throttle: throttle,
|
||||
Performance: perfState,
|
||||
Processes: processInfo,
|
||||
}
|
||||
if power != nil {
|
||||
*status.Power /= 1000 // W
|
||||
}
|
||||
if bar1 != nil {
|
||||
*status.PCI.BAR1Used /= 1024 * 1024 // MiB
|
||||
}
|
||||
if pcirx != nil {
|
||||
*status.PCI.Throughput.RX /= 1000 // MB/s
|
||||
}
|
||||
if pcitx != nil {
|
||||
*status.PCI.Throughput.TX /= 1000 // MB/s
|
||||
}
|
||||
return
|
||||
}
|
||||
|
||||
func GetP2PLink(dev1, dev2 *Device) (link P2PLinkType, err error) {
|
||||
level, err := deviceGetTopologyCommonAncestor(dev1.handle, dev2.handle)
|
||||
if err != nil || level == nil {
|
||||
return P2PLinkUnknown, err
|
||||
}
|
||||
|
||||
switch *level {
|
||||
case C.NVML_TOPOLOGY_INTERNAL:
|
||||
link = P2PLinkSameBoard
|
||||
case C.NVML_TOPOLOGY_SINGLE:
|
||||
link = P2PLinkSingleSwitch
|
||||
case C.NVML_TOPOLOGY_MULTIPLE:
|
||||
link = P2PLinkMultiSwitch
|
||||
case C.NVML_TOPOLOGY_HOSTBRIDGE:
|
||||
link = P2PLinkHostBridge
|
||||
case C.NVML_TOPOLOGY_CPU:
|
||||
link = P2PLinkSameCPU
|
||||
case C.NVML_TOPOLOGY_SYSTEM:
|
||||
link = P2PLinkCrossCPU
|
||||
default:
|
||||
err = ErrUnsupportedP2PLink
|
||||
}
|
||||
return
|
||||
}
|
||||
|
||||
func (d *Device) GetComputeRunningProcesses() ([]uint, []uint64, error) {
|
||||
return d.handle.deviceGetComputeRunningProcesses()
|
||||
}
|
||||
|
||||
func (d *Device) GetGraphicsRunningProcesses() ([]uint, []uint64, error) {
|
||||
return d.handle.deviceGetGraphicsRunningProcesses()
|
||||
}
|
||||
|
||||
func (d *Device) GetAllRunningProcesses() ([]ProcessInfo, error) {
|
||||
return d.handle.deviceGetAllRunningProcesses()
|
||||
}
|
||||
|
||||
func (d *Device) GetDeviceMode() (mode *DeviceMode, err error) {
|
||||
defer func() {
|
||||
if r := recover(); r != nil {
|
||||
err = r.(error)
|
||||
}
|
||||
}()
|
||||
|
||||
display, err := d.getDisplayInfo()
|
||||
assert(err)
|
||||
|
||||
p, err := d.getPeristenceMode()
|
||||
assert(err)
|
||||
|
||||
accounting, err := d.getAccountingInfo()
|
||||
assert(err)
|
||||
|
||||
mode = &DeviceMode{
|
||||
DisplayInfo: display,
|
||||
Persistence: p,
|
||||
AccountingInfo: accounting,
|
||||
}
|
||||
return
|
||||
}
|
||||
5871
vendor/github.com/NVIDIA/gpu-monitoring-tools/bindings/go/nvml/nvml.h
generated
vendored
Normal file
5871
vendor/github.com/NVIDIA/gpu-monitoring-tools/bindings/go/nvml/nvml.h
generated
vendored
Normal file
File diff suppressed because it is too large
Load Diff
46
vendor/github.com/NVIDIA/gpu-monitoring-tools/bindings/go/nvml/nvml_dl.c
generated
vendored
Normal file
46
vendor/github.com/NVIDIA/gpu-monitoring-tools/bindings/go/nvml/nvml_dl.c
generated
vendored
Normal file
@@ -0,0 +1,46 @@
|
||||
// Copyright (c) 2015-2018, NVIDIA CORPORATION. All rights reserved.
|
||||
|
||||
#include <stddef.h>
|
||||
#include <dlfcn.h>
|
||||
|
||||
#include "nvml_dl.h"
|
||||
|
||||
// DLSYM looks up the symbol named `sym` in the library referenced by the
// file-level `handle` and stores its address in `x`. dlerror() is called
// first to clear any stale error, then again after dlsym() to detect failure;
// on failure the ENCLOSING function returns NVML_ERROR_FUNCTION_NOT_FOUND.
#define DLSYM(x, sym)                           \
do {                                            \
    dlerror();                                  \
    x = dlsym(handle, #sym);                    \
    if (dlerror() != NULL) {                    \
        return (NVML_ERROR_FUNCTION_NOT_FOUND); \
    }                                           \
} while (0)
|
||||
|
||||
typedef nvmlReturn_t (*nvmlSym_t)();
|
||||
|
||||
static void *handle;
|
||||
|
||||
nvmlReturn_t NVML_DL(nvmlInit)(void)
|
||||
{
|
||||
handle = dlopen("libnvidia-ml.so.1", RTLD_LAZY | RTLD_GLOBAL);
|
||||
if (handle == NULL) {
|
||||
return (NVML_ERROR_LIBRARY_NOT_FOUND);
|
||||
}
|
||||
return (nvmlInit());
|
||||
}
|
||||
|
||||
nvmlReturn_t NVML_DL(nvmlShutdown)(void)
|
||||
{
|
||||
nvmlReturn_t r = nvmlShutdown();
|
||||
if (r != NVML_SUCCESS) {
|
||||
return (r);
|
||||
}
|
||||
return (dlclose(handle) ? NVML_ERROR_UNKNOWN : NVML_SUCCESS);
|
||||
}
|
||||
|
||||
// Resolves nvmlDeviceGetTopologyCommonAncestor at call time and forwards the
// arguments to it. Returns NVML_ERROR_FUNCTION_NOT_FOUND (via DLSYM) when the
// loaded library does not export the symbol.
nvmlReturn_t NVML_DL(nvmlDeviceGetTopologyCommonAncestor)(
    nvmlDevice_t dev1, nvmlDevice_t dev2, nvmlGpuTopologyLevel_t *info)
{
    nvmlSym_t fn;

    DLSYM(fn, nvmlDeviceGetTopologyCommonAncestor);
    return fn(dev1, dev2, info);
}
|
||||
15
vendor/github.com/NVIDIA/gpu-monitoring-tools/bindings/go/nvml/nvml_dl.h
generated
vendored
Normal file
15
vendor/github.com/NVIDIA/gpu-monitoring-tools/bindings/go/nvml/nvml_dl.h
generated
vendored
Normal file
@@ -0,0 +1,15 @@
|
||||
// Copyright (c) 2015-2018, NVIDIA CORPORATION. All rights reserved.
|
||||
|
||||
#ifndef _NVML_DL_H_
#define _NVML_DL_H_

#include "nvml.h"

// NVML_DL(x) names the dlopen-based wrapper for NVML function x by appending
// the `_dl` suffix.
#define NVML_DL(x) x##_dl

// Wrapper that opens the NVML library before calling nvmlInit.
extern nvmlReturn_t NVML_DL(nvmlInit)(void);

// Wrapper that calls nvmlShutdown and then closes the NVML library.
extern nvmlReturn_t NVML_DL(nvmlShutdown)(void);

// Wrapper that resolves nvmlDeviceGetTopologyCommonAncestor at call time.
extern nvmlReturn_t NVML_DL(nvmlDeviceGetTopologyCommonAncestor)(
    nvmlDevice_t, nvmlDevice_t, nvmlGpuTopologyLevel_t *);

#endif // _NVML_DL_H_
|
||||
2
vendor/vendor.json
vendored
2
vendor/vendor.json
vendored
@@ -9,6 +9,8 @@
|
||||
{"path":"github.com/Azure/go-ansiterm/winterm","checksumSHA1":"jBimnggjIiFUjaImNoJhSVLtdzw=","revision":"fa152c58bc15761d0200cb75fe958b89a9d4888e","revisionTime":"2016-06-22T17:32:16Z"},
|
||||
{"path":"github.com/DataDog/datadog-go/statsd","checksumSHA1":"WvApwvvSe3i/3KO8300dyeFmkbI=","revision":"b10af4b12965a1ad08d164f57d14195b4140d8de","revisionTime":"2017-08-09T10:47:06Z"},
|
||||
{"path":"github.com/Microsoft/go-winio","checksumSHA1":"AzjRkOQtVBTwIw4RJLTygFhJs3s=","revision":"f533f7a102197536779ea3a8cb881d639e21ec5a","revisionTime":"2017-05-24T00:36:31Z"},
|
||||
{"path":"github.com/NVIDIA/gpu-monitoring-tools","checksumSHA1":"kF1vk+8Xvb3nGBiw9+qbUc0SZ4M=","revision":"86f2a9fac6c5b597dc494420005144b8ef7ec9fb","revisionTime":"2018-08-29T22:20:09Z"},
|
||||
{"path":"github.com/NVIDIA/gpu-monitoring-tools/bindings/go/nvml","checksumSHA1":"P8FATSSgpe5A17FyPrGpsX95Xw8=","revision":"86f2a9fac6c5b597dc494420005144b8ef7ec9fb","revisionTime":"2018-08-29T22:20:09Z"},
|
||||
{"path":"github.com/NYTimes/gziphandler","checksumSHA1":"jktW57+vJsziNVPeXMCoujTzdW4=","revision":"97ae7fbaf81620fe97840685304a78a306a39c64","revisionTime":"2017-09-16T00:36:49Z"},
|
||||
{"path":"github.com/Nvveen/Gotty","checksumSHA1":"Aqy8/FoAIidY/DeQ5oTYSZ4YFVc=","revision":"cd527374f1e5bff4938207604a14f2e38a9cf512","revisionTime":"2012-06-04T00:48:16Z"},
|
||||
{"path":"github.com/RackSec/srslog","checksumSHA1":"OTN4c1F0p+mEG2CpkU1Kuavupf0=","revision":"259aed10dfa74ea2961eddd1d9847619f6e98837","revisionTime":"2016-01-20T22:33:50Z"},
|
||||
|
||||
Reference in New Issue
Block a user