Merge pull request #4638 from oleksii-shyman/nvidia-plugin

WIP :: Nvidia Plugin
2026-01-08 19:35:41 +03:00 · 2018-10-04 15:24:36 -07:00
parent b8c3a1d02d a7e04f1520
commit 343e06c60f
21 changed files with 13221 additions and 1 deletions
--- a/helper/funcs.go
+++ b/helper/funcs.go
@@ -57,11 +57,16 @@ func Int64ToPtr(i int64) *int64 {
 	return &i
 }

-// UintToPtr returns the pointer to an uint
+// Uint64ToPtr returns the pointer to an uint64
 func Uint64ToPtr(u uint64) *uint64 {
 	return &u
 }

+// UintToPtr returns the pointer to an uint
+func UintToPtr(u uint) *uint {
+	return &u
+}
+
 // StringToPtr returns the pointer to a string
 func StringToPtr(str string) *string {
 	return &str
--- a/plugins/device/cmd/nvidia/README.md
+++ b/plugins/device/cmd/nvidia/README.md
@@ -0,0 +1,23 @@
+This package provides an implementation of the Nvidia device plugin.
+
+# Behavior
+
+The Nvidia device plugin uses NVML bindings to gather data about the available Nvidia devices and exposes them via the Fingerprint RPC. GPUs can be excluded from fingerprinting by listing their UUIDs in the `ignored_gpu_ids` field. The plugin sends statistics for fingerprinted devices every `stats_period`.
+
+# Config
+
+The configuration should be passed via an HCL file that begins with a top level `config` stanza:
+
+```
+config {
+  ignored_gpu_ids = ["uuid1", "uuid2"]
+  fingerprint_period = "5s"
+  stats_period = "5s"
+}
+```
+
+The valid configuration options are:
+
+* `ignored_gpu_ids` (`list(string)`: `[]`): List of GPU UUID strings that should not be exposed to Nomad.
+* `fingerprint_period` (`string`: `"5s"`): The interval at which to repeat the fingerprint process to identify possible changes.
+* `stats_period` (`string`: `"5s"`): The interval at which to emit statistics about the devices.
--- a/plugins/device/cmd/nvidia/cmd/main.go
+++ b/plugins/device/cmd/nvidia/cmd/main.go
@@ -0,0 +1,18 @@
+package main
+
+import (
+	log "github.com/hashicorp/go-hclog"
+
+	"github.com/hashicorp/nomad/plugins"
+	"github.com/hashicorp/nomad/plugins/device/cmd/nvidia"
+)
+
+func main() {
+	// Serve the plugin, passing factory as the plugin constructor
+	plugins.Serve(factory)
+}
+
+// factory returns a new instance of the Nvidia GPU plugin built with the given logger
+func factory(log log.Logger) interface{} { // NOTE: the log parameter shadows the go-hclog import alias here
+	return nvidia.NewNvidiaDevice(log)
+}
--- a/plugins/device/cmd/nvidia/device.go
+++ b/plugins/device/cmd/nvidia/device.go
@@ -0,0 +1,209 @@
+package nvidia
+
+import (
+	"context"
+	"fmt"
+	"strings"
+	"sync"
+	"time"
+
+	log "github.com/hashicorp/go-hclog"
+
+	"github.com/hashicorp/nomad/plugins/base"
+	"github.com/hashicorp/nomad/plugins/device"
+	"github.com/hashicorp/nomad/plugins/device/cmd/nvidia/nvml"
+	"github.com/hashicorp/nomad/plugins/shared/hclspec"
+)
+
+const (
+	// pluginName is the plugin name used in pluginInfo and as the logger name
+	pluginName = "nvidia-gpu"
+
+	// vendor is the Vendor value set on every reported device group
+	vendor = "nvidia"
+
+	// deviceType is the Type value set on every reported device group
+	deviceType = device.DeviceTypeGPU
+
+	// notAvailable is the placeholder reported to the Nomad server for
+	// properties that the nvml driver could not detect
+	notAvailable = "N/A"
+)
+
+const (
+	// nvidiaVisibleDevices is the nvidia-container-runtime environment variable that Reserve sets to the comma-joined device IDs
+	nvidiaVisibleDevices = "NVIDIA_VISIBLE_DEVICES"
+)
+
+var (
+	// pluginInfo describes the plugin; PluginInfo returns this shared pointer as-is
+	pluginInfo = &base.PluginInfoResponse{
+		Type:             base.PluginTypeDevice,
+		PluginApiVersion: "0.0.1", // XXX This should be an array and should be consts
+		PluginVersion:    "0.1.0",
+		Name:             pluginName,
+	}
+
+	// configSpec is the plugin's configuration schema; each attribute is wrapped in NewDefault with a literal fallback
+	configSpec = hclspec.NewObject(map[string]*hclspec.Spec{
+		"ignored_gpu_ids": hclspec.NewDefault(
+			hclspec.NewAttr("ignored_gpu_ids", "list(string)", false),
+			hclspec.NewLiteral("[]"),
+		),
+		"fingerprint_period": hclspec.NewDefault(
+			hclspec.NewAttr("fingerprint_period", "string", false),
+			hclspec.NewLiteral("\"5s\""),
+		),
+		"stats_period": hclspec.NewDefault(
+			hclspec.NewAttr("stats_period", "string", false),
+			hclspec.NewLiteral("\"5s\""),
+		),
+	})
+)
+
+// Config is the plugin configuration that SetConfig decodes from its msgpack payload.
+type Config struct {
+	IgnoredGPUIDs     []string `codec:"ignored_gpu_ids"` // UUIDs added to NvidiaDevice.ignoredGPUIDs
+	FingerprintPeriod string   `codec:"fingerprint_period"` // parsed with time.ParseDuration
+	StatsPeriod       string   `codec:"stats_period"` // parsed with time.ParseDuration
+}
+
+// NvidiaDevice contains all plugin specific data
+type NvidiaDevice struct {
+	// nvmlClient is used to get data from nvidia
+	nvmlClient nvml.NvmlClient
+
+	// nvmlClientInitializationError holds an error retrieved during
+	// nvmlClient initialization; fingerprint checks it before any NVML call
+	nvmlClientInitializationError error
+
+	// ignoredGPUIDs is a set of UUIDs that would not be exposed to nomad
+	ignoredGPUIDs map[string]struct{}
+
+	// fingerprintPeriod is how often we should call nvml to get list of devices
+	fingerprintPeriod time.Duration
+
+	// statsPeriod is how often we should collect statistics for fingerprinted
+	// devices.
+	statsPeriod time.Duration
+
+	// devices is the set of UUIDs seen by the latest fingerprint run (ignored GPUs excluded)
+	devices    map[string]struct{}
+	deviceLock sync.RWMutex // guards devices
+
+	logger log.Logger // named after pluginName by NewNvidiaDevice
+}
+
+// NewNvidiaDevice returns a new nvidia device plugin. An NVML initialization error is logged and stored, not returned.
+func NewNvidiaDevice(log log.Logger) *NvidiaDevice {
+	nvmlClient, nvmlClientInitializationError := nvml.NewNvmlClient()
+	logger := log.Named(pluginName)
+	if nvmlClientInitializationError != nil {
+		logger.Error("unable to initialize Nvidia driver", "error", nvmlClientInitializationError)
+	}
+	return &NvidiaDevice{
+		logger:                        logger,
+		devices:                       make(map[string]struct{}),
+		ignoredGPUIDs:                 make(map[string]struct{}),
+		nvmlClient:                    nvmlClient,
+		nvmlClientInitializationError: nvmlClientInitializationError,
+	}
+}
+
+// PluginInfo returns the package-level pluginInfo describing the plugin; the error is always nil.
+func (d *NvidiaDevice) PluginInfo() (*base.PluginInfoResponse, error) {
+	return pluginInfo, nil
+}
+
+// ConfigSchema returns the package-level configSpec as the plugin's configuration schema; the error is always nil.
+func (d *NvidiaDevice) ConfigSchema() (*hclspec.Spec, error) {
+	return configSpec, nil
+}
+
+// SetConfig decodes the msgpack-encoded Config and applies it to the plugin.
+func (d *NvidiaDevice) SetConfig(data []byte) error {
+	var config Config
+	if err := base.MsgPackDecode(data, &config); err != nil {
+		return err
+	}
+
+	// Validate both periods before touching the receiver so that a bad
+	// config never leaves the plugin partially configured.
+	period, err := time.ParseDuration(config.FingerprintPeriod)
+	if err != nil {
+		return fmt.Errorf("failed to parse fingerprint period %q: %v", config.FingerprintPeriod, err)
+	}
+	speriod, err := time.ParseDuration(config.StatsPeriod)
+	if err != nil {
+		return fmt.Errorf("failed to parse stats period %q: %v", config.StatsPeriod, err)
+	}
+
+	for _, ignoredGPUId := range config.IgnoredGPUIDs {
+		d.ignoredGPUIDs[ignoredGPUId] = struct{}{}
+	}
+	d.fingerprintPeriod = period
+	d.statsPeriod = speriod
+
+	return nil
+}
+
+// Fingerprint streams detected devices over an unbuffered channel. A message is
+// emitted when the set of detected device UUIDs changes or an NVML query fails.
+func (d *NvidiaDevice) Fingerprint(ctx context.Context) (<-chan *device.FingerprintResponse, error) {
+	outCh := make(chan *device.FingerprintResponse)
+	go d.fingerprint(ctx, outCh)
+	return outCh, nil
+}
+
+type reservationError struct { // returned by Reserve when requested IDs are not in d.devices
+	notExistingIDs []string // the requested IDs that were not found
+}
+
+func (e *reservationError) Error() string { // lists the unknown IDs, comma separated
+	return fmt.Sprintf("unknown device IDs: %s", strings.Join(e.notExistingIDs, ","))
+}
+
+// Reserve returns information on how to mount given devices.
+// Assumption is made that nomad server is responsible for correctness of
+// GPU allocations, handling tricky cases such as double-allocation of single GPU
+func (d *NvidiaDevice) Reserve(deviceIDs []string) (*device.ContainerReservation, error) {
+	if len(deviceIDs) == 0 {
+		return &device.ContainerReservation{}, nil
+	}
+	// Due to the asynchronous nature of NvidiaPlugin, there is a possibility
+	// of a race condition
+	//
+	// Timeline:
+	//   1 - fingerprint reports that GPU with id "1" is present
+	//   2 - the following events happen at the same time:
+	//       a) server decides to allocate GPU with id "1"
+	//       b) fingerprint check reports that GPU with id "1" is no longer present
+	//
+	// The latest version of the fingerprinted ids is stored in the
+	// d.devices map. To avoid this race condition a reservationError is returned
+	// if any of the provided deviceIDs is not found in the d.devices map
+	d.deviceLock.RLock()
+	var notExistingIDs []string
+	for _, id := range deviceIDs {
+		if _, deviceIDExists := d.devices[id]; !deviceIDExists {
+			notExistingIDs = append(notExistingIDs, id)
+		}
+	}
+	d.deviceLock.RUnlock()
+	if len(notExistingIDs) != 0 {
+		return nil, &reservationError{notExistingIDs}
+	}
+
+	return &device.ContainerReservation{
+		Envs: map[string]string{
+			nvidiaVisibleDevices: strings.Join(deviceIDs, ","),
+		},
+	}, nil
+}
+
+// Stats streams statistics for the detected devices: it starts d.stats in a goroutine and returns the unbuffered channel passed to it.
+func (d *NvidiaDevice) Stats(ctx context.Context) (<-chan *device.StatsResponse, error) {
+	outCh := make(chan *device.StatsResponse)
+	go d.stats(ctx, outCh)
+	return outCh, nil
+}
--- a/plugins/device/cmd/nvidia/device_test.go
+++ b/plugins/device/cmd/nvidia/device_test.go
@@ -0,0 +1,115 @@
+package nvidia
+
+import (
+	"testing"
+
+	"github.com/hashicorp/nomad/plugins/device/cmd/nvidia/nvml"
+
+	hclog "github.com/hashicorp/go-hclog"
+	"github.com/hashicorp/nomad/plugins/device"
+	"github.com/stretchr/testify/require"
+)
+
+type MockNvmlClient struct { // test double that returns canned values from both client calls
+	FingerprintError            error // returned by GetFingerprintData
+	FingerprintResponseReturned *nvml.FingerprintData // returned by GetFingerprintData
+
+	StatsError            error // returned by GetStatsData
+	StatsResponseReturned []*nvml.StatsData // returned by GetStatsData
+}
+
+func (c *MockNvmlClient) GetFingerprintData() (*nvml.FingerprintData, error) { // returns the canned pair unchanged
+	return c.FingerprintResponseReturned, c.FingerprintError
+}
+
+func (c *MockNvmlClient) GetStatsData() ([]*nvml.StatsData, error) { // returns the canned pair unchanged
+	return c.StatsResponseReturned, c.StatsError
+}
+
+func TestReserve(t *testing.T) { // table-driven check of Reserve's reservation and error results
+	for _, testCase := range []struct {
+		Name                string
+		ExpectedReservation *device.ContainerReservation
+		ExpectedError       error
+		Device              *NvidiaDevice
+		RequestedIDs        []string
+	}{
+		{
+			Name:                "All RequestedIDs are not managed by Device",
+			ExpectedReservation: nil,
+			ExpectedError: &reservationError{[]string{
+				"UUID1",
+				"UUID2",
+				"UUID3",
+			}},
+			RequestedIDs: []string{
+				"UUID1",
+				"UUID2",
+				"UUID3",
+			},
+			Device: &NvidiaDevice{
+				logger: hclog.NewNullLogger(),
+			},
+		},
+		{
+			Name:                "Some RequestedIDs are not managed by Device",
+			ExpectedReservation: nil,
+			ExpectedError: &reservationError{[]string{
+				"UUID1",
+				"UUID2",
+			}},
+			RequestedIDs: []string{
+				"UUID1",
+				"UUID2",
+				"UUID3",
+			},
+			Device: &NvidiaDevice{
+				devices: map[string]struct{}{
+					"UUID3": {},
+				},
+				logger: hclog.NewNullLogger(),
+			},
+		},
+		{
+			Name: "All RequestedIDs are managed by Device",
+			ExpectedReservation: &device.ContainerReservation{
+				Envs: map[string]string{
+					nvidiaVisibleDevices: "UUID1,UUID2,UUID3",
+				},
+			},
+			ExpectedError: nil,
+			RequestedIDs: []string{
+				"UUID1",
+				"UUID2",
+				"UUID3",
+			},
+			Device: &NvidiaDevice{
+				devices: map[string]struct{}{
+					"UUID1": {},
+					"UUID2": {},
+					"UUID3": {},
+				},
+				logger: hclog.NewNullLogger(),
+			},
+		},
+		{
+			Name:                "No IDs requested",
+			ExpectedReservation: &device.ContainerReservation{},
+			ExpectedError:       nil,
+			RequestedIDs:        nil,
+			Device: &NvidiaDevice{
+				devices: map[string]struct{}{
+					"UUID1": {},
+					"UUID2": {},
+					"UUID3": {},
+				},
+				logger: hclog.NewNullLogger(),
+			},
+		},
+	} {
+		actualReservation, actualError := testCase.Device.Reserve(testCase.RequestedIDs)
+		req := require.New(t)
+		// Pass the case name as the failure message so a failing row can be identified.
+		req.Equal(testCase.ExpectedReservation, actualReservation, testCase.Name)
+		req.Equal(testCase.ExpectedError, actualError, testCase.Name)
+	}
+}
--- a/plugins/device/cmd/nvidia/fingerprint.go
+++ b/plugins/device/cmd/nvidia/fingerprint.go
@@ -0,0 +1,235 @@
+package nvidia
+
+import (
+	"context"
+	"fmt"
+	"time"
+
+	"github.com/hashicorp/nomad/plugins/device"
+	"github.com/hashicorp/nomad/plugins/device/cmd/nvidia/nvml"
+)
+
+const (
+	// Attribute keys of the device group Attributes map reported in Fingerprint output
+	MemoryMiBAttr          = "memory_mib"
+	PowerWAttr             = "power_w"
+	BAR1MiBAttr            = "bar1_mib"
+	DriverVersionAttr      = "driver_version"
+	CoresClockMHzAttr      = "cores_clock_mhz"
+	MemoryClockMHzAttr     = "memory_clock_mhz"
+	PCIBandwidthMBPerSAttr = "pci_bandwidth_mb/s"
+	DisplayStateAttr       = "display_state"
+	PersistenceModeAttr    = "persistence_mode"
+)
+
+// fingerprint is the long running goroutine that detects hardware. It closes devices on exit.
+func (d *NvidiaDevice) fingerprint(ctx context.Context, devices chan<- *device.FingerprintResponse) {
+	defer close(devices)
+	if d.nvmlClientInitializationError != nil {
+		d.logger.Error("exiting fingerprinting due to problems with NVML loading", "error", d.nvmlClientInitializationError)
+		// Report "no working Nvidia GPUs" with an empty fingerprint, but give up if ctx is cancelled first.
+		select {
+		case devices <- device.NewFingerprint():
+		case <-ctx.Done():
+		}
+		return
+	}
+
+	timer := time.NewTimer(0) // fires immediately for the first detection, re-armed every round
+	defer timer.Stop()
+	for {
+		select {
+		case <-ctx.Done():
+			return
+		case <-timer.C:
+			timer.Reset(d.fingerprintPeriod)
+		}
+		d.writeFingerprintToChannel(devices)
+	}
+}
+
+// writeFingerprintToChannel queries nvml once and sends a response only on error or when the device set changed
+func (d *NvidiaDevice) writeFingerprintToChannel(devices chan<- *device.FingerprintResponse) {
+	fingerprintData, err := d.nvmlClient.GetFingerprintData()
+
+	if err != nil {
+		d.logger.Error("failed to get fingerprint nvidia devices", "error", err)
+		devices <- device.NewFingerprintError(err)
+		return
+	}
+
+	// drop the GPUs listed in d.ignoredGPUIDs from the fingerprint output
+	fingerprintDevices := ignoreFingerprintedDevices(fingerprintData.Devices, d.ignoredGPUIDs)
+	// nothing to send unless a device UUID appeared or disappeared since the last run
+	if !d.fingerprintChanged(fingerprintDevices) {
+		return
+	}
+
+	commonAttributes := map[string]string{
+		DriverVersionAttr: fingerprintData.DriverVersion,
+	}
+
+	// Group all FingerprintDevices by DeviceName attribute
+	deviceListByDeviceName := make(map[string][]*nvml.FingerprintDeviceData)
+	for _, device := range fingerprintDevices { // NOTE: device shadows the imported package inside this loop
+		deviceName := device.DeviceName
+		if deviceName == nil {
+			// nvml driver was not able to detect the device name. Such
+			// devices are placed in a single group named with the 'notAvailable' value
+			notAvailableCopy := notAvailable
+			deviceName = &notAvailableCopy
+		}
+
+		deviceListByDeviceName[*deviceName] = append(deviceListByDeviceName[*deviceName], device)
+	}
+
+	// Build Fingerprint response with computed groups and send it over the channel
+	deviceGroups := make([]*device.DeviceGroup, 0, len(deviceListByDeviceName))
+	for groupName, devices := range deviceListByDeviceName { // NOTE: devices shadows the channel parameter inside this loop
+		deviceGroups = append(deviceGroups, deviceGroupFromFingerprintData(groupName, devices, commonAttributes))
+	}
+	devices <- device.NewFingerprint(deviceGroups...)
+}
+
+// ignoreFingerprintedDevices returns deviceData without the entries whose UUID is a key of ignoredGPUIDs (nil if none remain)
+func ignoreFingerprintedDevices(deviceData []*nvml.FingerprintDeviceData, ignoredGPUIDs map[string]struct{}) []*nvml.FingerprintDeviceData {
+	var result []*nvml.FingerprintDeviceData
+	for _, fingerprintDevice := range deviceData {
+		if _, ignored := ignoredGPUIDs[fingerprintDevice.UUID]; !ignored {
+			result = append(result, fingerprintDevice)
+		}
+	}
+	return result
+}
+
+// fingerprintChanged reports whether any previously unseen nvidia device was found
+// or any fingerprinted nvidia device disappeared since the last fingerprint run.
+// It always replaces d.devices with the UUID set of allDevices, under deviceLock
+func (d *NvidiaDevice) fingerprintChanged(allDevices []*nvml.FingerprintDeviceData) bool {
+	d.deviceLock.Lock()
+	defer d.deviceLock.Unlock()
+
+	changeDetected := false
+	// a device in allDevices that is missing from d.devices is a new device
+	for _, device := range allDevices {
+		if _, ok := d.devices[device.UUID]; !ok {
+			changeDetected = true
+		}
+	}
+
+	// a device in d.devices that is missing from allDevices has disappeared
+	fingerprintDeviceMap := make(map[string]struct{})
+	for _, device := range allDevices {
+		fingerprintDeviceMap[device.UUID] = struct{}{}
+	}
+	for id := range d.devices {
+		if _, ok := fingerprintDeviceMap[id]; !ok {
+			changeDetected = true
+		}
+	}
+
+	d.devices = fingerprintDeviceMap
+	return changeDetected
+}
+
+// deviceGroupFromFingerprintData builds one DeviceGroup named groupName from deviceList, merging commonAttributes into its attributes
+func deviceGroupFromFingerprintData(groupName string, deviceList []*nvml.FingerprintDeviceData, commonAttributes map[string]string) *device.DeviceGroup {
+	// deviceGroup without devices makes no sense -> return nil when no devices are provided
+	if len(deviceList) == 0 {
+		return nil
+	}
+
+	devices := make([]*device.Device, len(deviceList))
+	for index, dev := range deviceList {
+		devices[index] = &device.Device{
+			ID: dev.UUID,
+			// all fingerprinted devices are "healthy" for now
+			// to get real health data -> dcgm bindings should be used
+			Healthy: true,
+			HwLocality: &device.DeviceLocality{
+				PciBusID: dev.PCIBusID,
+			},
+		}
+	}
+
+	deviceGroup := &device.DeviceGroup{
+		Vendor:  vendor,
+		Type:    deviceType,
+		Name:    groupName,
+		Devices: devices,
+		// Assumption made that devices with the same DeviceName have the same
+		// attributes (memory, power, BAR1 memory etc), so only the first device is read
+		Attributes: attributesFromFingerprintDeviceData(deviceList[0]),
+	}
+
+	// Extend attribute map with common attributes; a common key overwrites a device key
+	for attributeKey, attributeValue := range commonAttributes {
+		deviceGroup.Attributes[attributeKey] = attributeValue
+	}
+
+	return deviceGroup
+}
+
+// attributesFromFingerprintDeviceData converts a nvml.FingerprintDeviceData
+// struct to the device.DeviceGroup.Attributes format (map[string]string).
+// It nil-checks every pointer field; the fingerprintDeviceData argument itself must be non-nil
+func attributesFromFingerprintDeviceData(fingerprintDeviceData *nvml.FingerprintDeviceData) map[string]string {
+	// The following fields in FingerprintDeviceData are pointers, so they can be nil
+	// A nil field is reported as the 'notAvailable' constant, otherwise fmt.Sprint of its value
+	var (
+		MemoryMiB          string
+		PowerW             string
+		BAR1MiB            string
+		CoresClockMHz      string
+		MemoryClockMHz     string
+		PCIBandwidthMBPerS string
+	)
+
+	if fingerprintDeviceData.MemoryMiB == nil {
+		MemoryMiB = notAvailable
+	} else {
+		MemoryMiB = fmt.Sprint(*fingerprintDeviceData.MemoryMiB)
+	}
+
+	if fingerprintDeviceData.PowerW == nil {
+		PowerW = notAvailable
+	} else {
+		PowerW = fmt.Sprint(*fingerprintDeviceData.PowerW)
+	}
+
+	if fingerprintDeviceData.BAR1MiB == nil {
+		BAR1MiB = notAvailable
+	} else {
+		BAR1MiB = fmt.Sprint(*fingerprintDeviceData.BAR1MiB)
+	}
+
+	if fingerprintDeviceData.CoresClockMHz == nil {
+		CoresClockMHz = notAvailable
+	} else {
+		CoresClockMHz = fmt.Sprint(*fingerprintDeviceData.CoresClockMHz)
+	}
+
+	if fingerprintDeviceData.MemoryClockMHz == nil {
+		MemoryClockMHz = notAvailable
+	} else {
+		MemoryClockMHz = fmt.Sprint(*fingerprintDeviceData.MemoryClockMHz)
+	}
+
+	if fingerprintDeviceData.PCIBandwidthMBPerS == nil {
+		PCIBandwidthMBPerS = notAvailable
+	} else {
+		PCIBandwidthMBPerS = fmt.Sprint(*fingerprintDeviceData.PCIBandwidthMBPerS)
+	}
+
+	return map[string]string{
+		DisplayStateAttr:       fingerprintDeviceData.DisplayState, // plain string, copied as-is
+		PersistenceModeAttr:    fingerprintDeviceData.PersistenceMode, // plain string, copied as-is
+		MemoryMiBAttr:          MemoryMiB,
+		PowerWAttr:             PowerW,
+		BAR1MiBAttr:            BAR1MiB,
+		CoresClockMHzAttr:      CoresClockMHz,
+		MemoryClockMHzAttr:     MemoryClockMHz,
+		PCIBandwidthMBPerSAttr: PCIBandwidthMBPerS,
+	}
+
+}
--- a/plugins/device/cmd/nvidia/fingerprint_test.go
+++ b/plugins/device/cmd/nvidia/fingerprint_test.go
--- a/plugins/device/cmd/nvidia/nvml/client.go
+++ b/plugins/device/cmd/nvidia/nvml/client.go
@@ -0,0 +1,194 @@
+package nvml
+
+import (
+	"fmt"
+)
+
+// DeviceData represents the fields shared by FingerprintDeviceData and StatsData
+type DeviceData struct {
+	UUID       string
+	DeviceName *string // copied from DeviceInfo.Name
+	MemoryMiB  *uint64
+	PowerW     *uint
+	BAR1MiB    *uint64
+}
+
+// FingerprintDeviceData is a superset of DeviceData (embedded by pointer).
+// It describes the device specific fields returned from
+// nvml queries during a fingerprinting call
+type FingerprintDeviceData struct {
+	*DeviceData
+	PCIBandwidthMBPerS *uint
+	CoresClockMHz      *uint
+	MemoryClockMHz     *uint
+	DisplayState       string
+	PersistenceMode    string
+	PCIBusID           string
+}
+
+// FingerprintData represents attributes of the driver and its devices
+type FingerprintData struct {
+	Devices       []*FingerprintDeviceData // one entry per device index reported by the driver
+	DriverVersion string // value returned by SystemDriverVersion
+}
+
+// StatsData is a superset of DeviceData (embedded by pointer).
+// It represents statistics data returned for every Nvidia device
+type StatsData struct {
+	*DeviceData
+	PowerUsageW        *uint
+	GPUUtilization     *uint
+	MemoryUtilization  *uint
+	EncoderUtilization *uint
+	DecoderUtilization *uint
+	TemperatureC       *uint
+	UsedMemoryMiB      *uint64
+	BAR1UsedMiB        *uint64
+	ECCErrorsL1Cache   *uint64
+	ECCErrorsL2Cache   *uint64
+	ECCErrorsDevice    *uint64
+}
+
+// NvmlClient is the interface through which users of this package query nvml
+type NvmlClient interface {
+	GetFingerprintData() (*FingerprintData, error) // driver version plus per-device fingerprint fields
+	GetStatsData() ([]*StatsData, error) // one StatsData per device
+}
+
+// nvmlClient implements NvmlClient on top of a NvmlDriver.
+// Users of this lib are expected to obtain it via the NewNvmlClient func
+type nvmlClient struct {
+	driver NvmlDriver // used for every nvml query made by the client
+}
+
+// NewNvmlClient creates a new nvmlClient backed by the real NvmlDriver
+// implementation and initializes that driver; on Initialize failure it returns (nil, err)
+func NewNvmlClient() (*nvmlClient, error) {
+	driver := &nvmlDriver{}
+	err := driver.Initialize()
+	if err != nil {
+		return nil, err
+	}
+	return &nvmlClient{
+		driver: driver,
+	}, nil
+}
+
+// GetFingerprintData returns FingerprintData for available Nvidia devices; the first driver error aborts the call
+func (c *nvmlClient) GetFingerprintData() (*FingerprintData, error) {
+	/*
+		nvml fields to be fingerprinted # nvml_library_call
+		1  - Driver Version             # nvmlSystemGetDriverVersion
+		2  - Product Name               # nvmlDeviceGetName
+		3  - GPU UUID                   # nvmlDeviceGetUUID
+		4  - Total Memory               # nvmlDeviceGetMemoryInfo
+		5  - Power                      # nvmlDeviceGetPowerManagementLimit
+		6  - PCIBusID                   # nvmlDeviceGetPciInfo
+		7  - BAR1 Memory                # nvmlDeviceGetBAR1MemoryInfo
+		8  - PCI Bandwidth
+		9  - Memory, Cores Clock        # nvmlDeviceGetMaxClockInfo
+		10 - Display Mode               # nvmlDeviceGetDisplayMode
+		11 - Persistence Mode           # nvmlDeviceGetPersistenceMode
+	*/
+
+	// Assumed that this method is called with a receiver retrieved from
+	// NewNvmlClient, because NewNvmlClient (not this method) handles
+	// the initialization of the NVML library
+
+	driverVersion, err := c.driver.SystemDriverVersion()
+	if err != nil {
+		return nil, fmt.Errorf("nvidia nvml SystemDriverVersion() error: %v", err)
+	}
+
+	numDevices, err := c.driver.DeviceCount()
+	if err != nil {
+		return nil, fmt.Errorf("nvidia nvml DeviceCount() error: %v", err)
+	}
+
+	allNvidiaGPUResources := make([]*FingerprintDeviceData, numDevices)
+
+	for i := 0; i < int(numDevices); i++ {
+		deviceInfo, err := c.driver.DeviceInfoByIndex(uint(i))
+		if err != nil {
+			return nil, fmt.Errorf("nvidia nvml DeviceInfoByIndex() error: %v", err)
+		}
+
+		allNvidiaGPUResources[i] = &FingerprintDeviceData{
+			DeviceData: &DeviceData{
+				DeviceName: deviceInfo.Name,
+				UUID:       deviceInfo.UUID,
+				MemoryMiB:  deviceInfo.MemoryMiB,
+				PowerW:     deviceInfo.PowerW,
+				BAR1MiB:    deviceInfo.BAR1MiB,
+			},
+			PCIBandwidthMBPerS: deviceInfo.PCIBandwidthMBPerS,
+			CoresClockMHz:      deviceInfo.CoresClockMHz,
+			MemoryClockMHz:     deviceInfo.MemoryClockMHz,
+			DisplayState:       deviceInfo.DisplayState,
+			PersistenceMode:    deviceInfo.PersistenceMode,
+			PCIBusID:           deviceInfo.PCIBusID,
+		}
+	}
+	return &FingerprintData{
+		Devices:       allNvidiaGPUResources,
+		DriverVersion: driverVersion,
+	}, nil
+}
+
+// GetStatsData returns statistics data for all devices on this machine; the first driver error aborts the call
+func (c *nvmlClient) GetStatsData() ([]*StatsData, error) {
+	/*
+	   nvml fields to be reported to stats api     # nvml_library_call
+	   1  - Used Memory                            # nvmlDeviceGetMemoryInfo
+	   2  - Utilization of GPU                     # nvmlDeviceGetUtilizationRates
+	   3  - Utilization of Memory                  # nvmlDeviceGetUtilizationRates
+	   4  - Utilization of Decoder                 # nvmlDeviceGetDecoderUtilization
+	   5  - Utilization of Encoder                 # nvmlDeviceGetEncoderUtilization
+	   6  - Current GPU Temperature                # nvmlDeviceGetTemperature
+	   7  - Power Draw                             # nvmlDeviceGetPowerUsage
+	   8  - BAR1 Used memory                       # nvmlDeviceGetBAR1MemoryInfo
+	   9  - ECC Errors on requesting L1Cache       # nvmlDeviceGetMemoryErrorCounter
+	   10 - ECC Errors on requesting L2Cache       # nvmlDeviceGetMemoryErrorCounter
+	   11 - ECC Errors on requesting Device memory # nvmlDeviceGetMemoryErrorCounter
+	*/
+
+	// Assumed that this method is called with a receiver retrieved from
+	// NewNvmlClient, because NewNvmlClient (not this method) handles
+	// the initialization of the NVML library
+
+	numDevices, err := c.driver.DeviceCount()
+	if err != nil {
+		return nil, fmt.Errorf("nvidia nvml DeviceCount() error: %v", err)
+	}
+
+	allNvidiaGPUStats := make([]*StatsData, numDevices)
+
+	for i := 0; i < int(numDevices); i++ {
+		deviceInfo, deviceStatus, err := c.driver.DeviceInfoAndStatusByIndex(uint(i))
+		if err != nil {
+			return nil, fmt.Errorf("nvidia nvml DeviceInfoAndStatusByIndex() error: %v", err)
+		}
+
+		allNvidiaGPUStats[i] = &StatsData{
+			DeviceData: &DeviceData{
+				DeviceName: deviceInfo.Name,
+				UUID:       deviceInfo.UUID,
+				MemoryMiB:  deviceInfo.MemoryMiB,
+				PowerW:     deviceInfo.PowerW,
+				BAR1MiB:    deviceInfo.BAR1MiB,
+			},
+			PowerUsageW:        deviceStatus.PowerUsageW,
+			GPUUtilization:     deviceStatus.GPUUtilization,
+			MemoryUtilization:  deviceStatus.MemoryUtilization,
+			EncoderUtilization: deviceStatus.EncoderUtilization,
+			DecoderUtilization: deviceStatus.DecoderUtilization,
+			TemperatureC:       deviceStatus.TemperatureC,
+			UsedMemoryMiB:      deviceStatus.UsedMemoryMiB,
+			BAR1UsedMiB:        deviceStatus.BAR1UsedMiB,
+			ECCErrorsL1Cache:   deviceStatus.ECCErrorsL1Cache,
+			ECCErrorsL2Cache:   deviceStatus.ECCErrorsL2Cache,
+			ECCErrorsDevice:    deviceStatus.ECCErrorsDevice,
+		}
+	}
+	return allNvidiaGPUStats, nil
+}
--- a/plugins/device/cmd/nvidia/nvml/client_test.go
+++ b/plugins/device/cmd/nvidia/nvml/client_test.go
@@ -0,0 +1,399 @@
+package nvml
+
+import (
+	"errors"
+	"testing"
+
+	"github.com/hashicorp/nomad/helper"
+	"github.com/stretchr/testify/require"
+)
+
+// MockNVMLDriver is a test double for the NvmlDriver interface. Each
+// *CallSuccessful flag decides whether the matching method returns its canned
+// data or an error.
+type MockNVMLDriver struct {
+	systemDriverCallSuccessful               bool
+	deviceCountCallSuccessful                bool
+	deviceInfoByIndexCallSuccessful          bool
+	deviceInfoAndStatusByIndexCallSuccessful bool
+	// driverVersion is the value returned by SystemDriverVersion on success
+	driverVersion                            string
+	// devices and deviceStatus hold the canned per-index results; the same
+	// index is used to look up both slices
+	devices                                  []*DeviceInfo
+	deviceStatus                             []*DeviceStatus
+}
+
+// Initialize is a no-op that always reports success.
+func (m *MockNVMLDriver) Initialize() error {
+	return nil
+}
+
+// Shutdown is a no-op that always reports success.
+func (m *MockNVMLDriver) Shutdown() error {
+	return nil
+}
+
+// SystemDriverVersion returns the configured driver version, or an error when
+// systemDriverCallSuccessful is false.
+func (m *MockNVMLDriver) SystemDriverVersion() (string, error) {
+	if m.systemDriverCallSuccessful {
+		return m.driverVersion, nil
+	}
+	return "", errors.New("failed to get system driver")
+}
+
+// DeviceCount reports how many devices the mock holds, or an error when
+// deviceCountCallSuccessful is false.
+func (m *MockNVMLDriver) DeviceCount() (uint, error) {
+	if m.deviceCountCallSuccessful {
+		return uint(len(m.devices)), nil
+	}
+	return 0, errors.New("failed to get device length")
+}
+
+// DeviceInfoByIndex returns the canned DeviceInfo stored at index. The range
+// check is performed before the configured-failure check.
+func (m *MockNVMLDriver) DeviceInfoByIndex(index uint) (*DeviceInfo, error) {
+	switch {
+	case index >= uint(len(m.devices)):
+		return nil, errors.New("index is out of range")
+	case !m.deviceInfoByIndexCallSuccessful:
+		return nil, errors.New("failed to get device info by index")
+	default:
+		return m.devices[index], nil
+	}
+}
+
+// DeviceInfoAndStatusByIndex returns the canned DeviceInfo and DeviceStatus
+// stored at index. The index must be valid for both slices; the range check
+// is performed before the configured-failure check.
+func (m *MockNVMLDriver) DeviceInfoAndStatusByIndex(index uint) (*DeviceInfo, *DeviceStatus, error) {
+	inRange := index < uint(len(m.devices)) && index < uint(len(m.deviceStatus))
+	if !inRange {
+		return nil, nil, errors.New("index is out of range")
+	}
+	if !m.deviceInfoAndStatusByIndexCallSuccessful {
+		return nil, nil, errors.New("failed to get device info and status by index")
+	}
+	info, status := m.devices[index], m.deviceStatus[index]
+	return info, status, nil
+}
+
+// TestGetFingerprintDataFromNVML drives nvmlClient.GetFingerprintData against
+// a MockNVMLDriver configured to fail at each driver call in turn, plus one
+// fully successful configuration.
+//
+// Every case runs as its own subtest so that a failing case is reported under
+// its name and does not prevent the remaining cases from running.
+func TestGetFingerprintDataFromNVML(t *testing.T) {
+	for _, testCase := range []struct {
+		Name                string
+		DriverConfiguration *MockNVMLDriver
+		ExpectedError       bool
+		ExpectedResult      *FingerprintData
+	}{
+		{
+			Name:           "fail on systemDriverCallSuccessful",
+			ExpectedError:  true,
+			ExpectedResult: nil,
+			DriverConfiguration: &MockNVMLDriver{
+				systemDriverCallSuccessful:      false,
+				deviceCountCallSuccessful:       true,
+				deviceInfoByIndexCallSuccessful: true,
+			},
+		},
+		{
+			Name:           "fail on deviceCountCallSuccessful",
+			ExpectedError:  true,
+			ExpectedResult: nil,
+			DriverConfiguration: &MockNVMLDriver{
+				systemDriverCallSuccessful:      true,
+				deviceCountCallSuccessful:       false,
+				deviceInfoByIndexCallSuccessful: true,
+			},
+		},
+		{
+			Name:           "fail on deviceInfoByIndexCall",
+			ExpectedError:  true,
+			ExpectedResult: nil,
+			DriverConfiguration: &MockNVMLDriver{
+				systemDriverCallSuccessful:      true,
+				deviceCountCallSuccessful:       true,
+				deviceInfoByIndexCallSuccessful: false,
+				devices: []*DeviceInfo{
+					{
+						UUID:               "UUID1",
+						Name:               helper.StringToPtr("ModelName1"),
+						MemoryMiB:          helper.Uint64ToPtr(16),
+						PCIBusID:           "busId",
+						PowerW:             helper.UintToPtr(100),
+						BAR1MiB:            helper.Uint64ToPtr(100),
+						PCIBandwidthMBPerS: helper.UintToPtr(100),
+						CoresClockMHz:      helper.UintToPtr(100),
+						MemoryClockMHz:     helper.UintToPtr(100),
+					}, {
+						UUID:               "UUID2",
+						Name:               helper.StringToPtr("ModelName2"),
+						MemoryMiB:          helper.Uint64ToPtr(8),
+						PCIBusID:           "busId",
+						PowerW:             helper.UintToPtr(100),
+						BAR1MiB:            helper.Uint64ToPtr(100),
+						PCIBandwidthMBPerS: helper.UintToPtr(100),
+						CoresClockMHz:      helper.UintToPtr(100),
+						MemoryClockMHz:     helper.UintToPtr(100),
+					},
+				},
+			},
+		},
+		{
+			Name:          "successful outcome",
+			ExpectedError: false,
+			ExpectedResult: &FingerprintData{
+				DriverVersion: "driverVersion",
+				Devices: []*FingerprintDeviceData{
+					{
+						DeviceData: &DeviceData{
+							DeviceName: helper.StringToPtr("ModelName1"),
+							UUID:       "UUID1",
+							MemoryMiB:  helper.Uint64ToPtr(16),
+							PowerW:     helper.UintToPtr(100),
+							BAR1MiB:    helper.Uint64ToPtr(100),
+						},
+						PCIBusID:           "busId1",
+						PCIBandwidthMBPerS: helper.UintToPtr(100),
+						CoresClockMHz:      helper.UintToPtr(100),
+						MemoryClockMHz:     helper.UintToPtr(100),
+						DisplayState:       "Enabled",
+						PersistenceMode:    "Enabled",
+					}, {
+						DeviceData: &DeviceData{
+							DeviceName: helper.StringToPtr("ModelName2"),
+							UUID:       "UUID2",
+							MemoryMiB:  helper.Uint64ToPtr(8),
+							PowerW:     helper.UintToPtr(200),
+							BAR1MiB:    helper.Uint64ToPtr(200),
+						},
+						PCIBusID:           "busId2",
+						PCIBandwidthMBPerS: helper.UintToPtr(200),
+						CoresClockMHz:      helper.UintToPtr(200),
+						MemoryClockMHz:     helper.UintToPtr(200),
+						DisplayState:       "Enabled",
+						PersistenceMode:    "Enabled",
+					},
+				},
+			},
+			DriverConfiguration: &MockNVMLDriver{
+				systemDriverCallSuccessful:      true,
+				deviceCountCallSuccessful:       true,
+				deviceInfoByIndexCallSuccessful: true,
+				driverVersion:                   "driverVersion",
+				devices: []*DeviceInfo{
+					{
+						UUID:               "UUID1",
+						Name:               helper.StringToPtr("ModelName1"),
+						MemoryMiB:          helper.Uint64ToPtr(16),
+						PCIBusID:           "busId1",
+						PowerW:             helper.UintToPtr(100),
+						BAR1MiB:            helper.Uint64ToPtr(100),
+						PCIBandwidthMBPerS: helper.UintToPtr(100),
+						CoresClockMHz:      helper.UintToPtr(100),
+						MemoryClockMHz:     helper.UintToPtr(100),
+						DisplayState:       "Enabled",
+						PersistenceMode:    "Enabled",
+					}, {
+						UUID:               "UUID2",
+						Name:               helper.StringToPtr("ModelName2"),
+						MemoryMiB:          helper.Uint64ToPtr(8),
+						PCIBusID:           "busId2",
+						PowerW:             helper.UintToPtr(200),
+						BAR1MiB:            helper.Uint64ToPtr(200),
+						PCIBandwidthMBPerS: helper.UintToPtr(200),
+						CoresClockMHz:      helper.UintToPtr(200),
+						MemoryClockMHz:     helper.UintToPtr(200),
+						DisplayState:       "Enabled",
+						PersistenceMode:    "Enabled",
+					},
+				},
+			},
+		},
+	} {
+		// Subtests run synchronously, so capturing the loop variable is safe
+		t.Run(testCase.Name, func(t *testing.T) {
+			cli := nvmlClient{driver: testCase.DriverConfiguration}
+			fingerprintData, err := cli.GetFingerprintData()
+			if testCase.ExpectedError {
+				require.Error(t, err)
+			} else {
+				require.NoError(t, err)
+			}
+			require.Equal(t, testCase.ExpectedResult, fingerprintData)
+		})
+	}
+}
+
+// TestGetStatsDataFromNVML drives nvmlClient.GetStatsData against a
+// MockNVMLDriver configured to fail at each driver call in turn, plus one
+// fully successful configuration.
+//
+// Every case runs as its own subtest so that a failing case is reported under
+// its name and does not prevent the remaining cases from running.
+func TestGetStatsDataFromNVML(t *testing.T) {
+	for _, testCase := range []struct {
+		Name                string
+		DriverConfiguration *MockNVMLDriver
+		ExpectedError       bool
+		ExpectedResult      []*StatsData
+	}{
+		{
+			Name:           "fail on deviceCountCallSuccessful",
+			ExpectedError:  true,
+			ExpectedResult: nil,
+			DriverConfiguration: &MockNVMLDriver{
+				systemDriverCallSuccessful:               true,
+				deviceCountCallSuccessful:                false,
+				deviceInfoByIndexCallSuccessful:          true,
+				deviceInfoAndStatusByIndexCallSuccessful: true,
+			},
+		},
+		{
+			Name:           "fail on DeviceInfoAndStatusByIndex call",
+			ExpectedError:  true,
+			ExpectedResult: nil,
+			DriverConfiguration: &MockNVMLDriver{
+				systemDriverCallSuccessful:               true,
+				deviceCountCallSuccessful:                true,
+				deviceInfoAndStatusByIndexCallSuccessful: false,
+				devices: []*DeviceInfo{
+					{
+						UUID:               "UUID1",
+						Name:               helper.StringToPtr("ModelName1"),
+						MemoryMiB:          helper.Uint64ToPtr(16),
+						PCIBusID:           "busId1",
+						PowerW:             helper.UintToPtr(100),
+						BAR1MiB:            helper.Uint64ToPtr(100),
+						PCIBandwidthMBPerS: helper.UintToPtr(100),
+						CoresClockMHz:      helper.UintToPtr(100),
+						MemoryClockMHz:     helper.UintToPtr(100),
+					}, {
+						UUID:               "UUID2",
+						Name:               helper.StringToPtr("ModelName2"),
+						MemoryMiB:          helper.Uint64ToPtr(8),
+						PCIBusID:           "busId2",
+						PowerW:             helper.UintToPtr(200),
+						BAR1MiB:            helper.Uint64ToPtr(200),
+						PCIBandwidthMBPerS: helper.UintToPtr(200),
+						CoresClockMHz:      helper.UintToPtr(200),
+						MemoryClockMHz:     helper.UintToPtr(200),
+					},
+				},
+				deviceStatus: []*DeviceStatus{
+					{
+						TemperatureC:       helper.UintToPtr(1),
+						GPUUtilization:     helper.UintToPtr(1),
+						MemoryUtilization:  helper.UintToPtr(1),
+						EncoderUtilization: helper.UintToPtr(1),
+						DecoderUtilization: helper.UintToPtr(1),
+						UsedMemoryMiB:      helper.Uint64ToPtr(1),
+						ECCErrorsL1Cache:   helper.Uint64ToPtr(1),
+						ECCErrorsL2Cache:   helper.Uint64ToPtr(1),
+						ECCErrorsDevice:    helper.Uint64ToPtr(1),
+						PowerUsageW:        helper.UintToPtr(1),
+						BAR1UsedMiB:        helper.Uint64ToPtr(1),
+					},
+					{
+						TemperatureC:       helper.UintToPtr(2),
+						GPUUtilization:     helper.UintToPtr(2),
+						MemoryUtilization:  helper.UintToPtr(2),
+						EncoderUtilization: helper.UintToPtr(2),
+						DecoderUtilization: helper.UintToPtr(2),
+						UsedMemoryMiB:      helper.Uint64ToPtr(2),
+						ECCErrorsL1Cache:   helper.Uint64ToPtr(2),
+						ECCErrorsL2Cache:   helper.Uint64ToPtr(2),
+						ECCErrorsDevice:    helper.Uint64ToPtr(2),
+						PowerUsageW:        helper.UintToPtr(2),
+						BAR1UsedMiB:        helper.Uint64ToPtr(2),
+					},
+				},
+			},
+		},
+		{
+			Name:          "successful outcome",
+			ExpectedError: false,
+			ExpectedResult: []*StatsData{
+				{
+					DeviceData: &DeviceData{
+						DeviceName: helper.StringToPtr("ModelName1"),
+						UUID:       "UUID1",
+						MemoryMiB:  helper.Uint64ToPtr(16),
+						PowerW:     helper.UintToPtr(100),
+						BAR1MiB:    helper.Uint64ToPtr(100),
+					},
+					TemperatureC:       helper.UintToPtr(1),
+					GPUUtilization:     helper.UintToPtr(1),
+					MemoryUtilization:  helper.UintToPtr(1),
+					EncoderUtilization: helper.UintToPtr(1),
+					DecoderUtilization: helper.UintToPtr(1),
+					UsedMemoryMiB:      helper.Uint64ToPtr(1),
+					ECCErrorsL1Cache:   helper.Uint64ToPtr(1),
+					ECCErrorsL2Cache:   helper.Uint64ToPtr(1),
+					ECCErrorsDevice:    helper.Uint64ToPtr(1),
+					PowerUsageW:        helper.UintToPtr(1),
+					BAR1UsedMiB:        helper.Uint64ToPtr(1),
+				},
+				{
+					DeviceData: &DeviceData{
+						DeviceName: helper.StringToPtr("ModelName2"),
+						UUID:       "UUID2",
+						MemoryMiB:  helper.Uint64ToPtr(8),
+						PowerW:     helper.UintToPtr(200),
+						BAR1MiB:    helper.Uint64ToPtr(200),
+					},
+					TemperatureC:       helper.UintToPtr(2),
+					GPUUtilization:     helper.UintToPtr(2),
+					MemoryUtilization:  helper.UintToPtr(2),
+					EncoderUtilization: helper.UintToPtr(2),
+					DecoderUtilization: helper.UintToPtr(2),
+					UsedMemoryMiB:      helper.Uint64ToPtr(2),
+					ECCErrorsL1Cache:   helper.Uint64ToPtr(2),
+					ECCErrorsL2Cache:   helper.Uint64ToPtr(2),
+					ECCErrorsDevice:    helper.Uint64ToPtr(2),
+					PowerUsageW:        helper.UintToPtr(2),
+					BAR1UsedMiB:        helper.Uint64ToPtr(2),
+				},
+			},
+			DriverConfiguration: &MockNVMLDriver{
+				deviceCountCallSuccessful:                true,
+				deviceInfoByIndexCallSuccessful:          true,
+				deviceInfoAndStatusByIndexCallSuccessful: true,
+				devices: []*DeviceInfo{
+					{
+						UUID:               "UUID1",
+						Name:               helper.StringToPtr("ModelName1"),
+						MemoryMiB:          helper.Uint64ToPtr(16),
+						PCIBusID:           "busId1",
+						PowerW:             helper.UintToPtr(100),
+						BAR1MiB:            helper.Uint64ToPtr(100),
+						PCIBandwidthMBPerS: helper.UintToPtr(100),
+						CoresClockMHz:      helper.UintToPtr(100),
+						MemoryClockMHz:     helper.UintToPtr(100),
+					}, {
+						UUID:               "UUID2",
+						Name:               helper.StringToPtr("ModelName2"),
+						MemoryMiB:          helper.Uint64ToPtr(8),
+						PCIBusID:           "busId2",
+						PowerW:             helper.UintToPtr(200),
+						BAR1MiB:            helper.Uint64ToPtr(200),
+						PCIBandwidthMBPerS: helper.UintToPtr(200),
+						CoresClockMHz:      helper.UintToPtr(200),
+						MemoryClockMHz:     helper.UintToPtr(200),
+					},
+				},
+				deviceStatus: []*DeviceStatus{
+					{
+						TemperatureC:       helper.UintToPtr(1),
+						GPUUtilization:     helper.UintToPtr(1),
+						MemoryUtilization:  helper.UintToPtr(1),
+						EncoderUtilization: helper.UintToPtr(1),
+						DecoderUtilization: helper.UintToPtr(1),
+						UsedMemoryMiB:      helper.Uint64ToPtr(1),
+						ECCErrorsL1Cache:   helper.Uint64ToPtr(1),
+						ECCErrorsL2Cache:   helper.Uint64ToPtr(1),
+						ECCErrorsDevice:    helper.Uint64ToPtr(1),
+						PowerUsageW:        helper.UintToPtr(1),
+						BAR1UsedMiB:        helper.Uint64ToPtr(1),
+					},
+					{
+						TemperatureC:       helper.UintToPtr(2),
+						GPUUtilization:     helper.UintToPtr(2),
+						MemoryUtilization:  helper.UintToPtr(2),
+						EncoderUtilization: helper.UintToPtr(2),
+						DecoderUtilization: helper.UintToPtr(2),
+						UsedMemoryMiB:      helper.Uint64ToPtr(2),
+						ECCErrorsL1Cache:   helper.Uint64ToPtr(2),
+						ECCErrorsL2Cache:   helper.Uint64ToPtr(2),
+						ECCErrorsDevice:    helper.Uint64ToPtr(2),
+						PowerUsageW:        helper.UintToPtr(2),
+						BAR1UsedMiB:        helper.Uint64ToPtr(2),
+					},
+				},
+			},
+		},
+	} {
+		// Subtests run synchronously, so capturing the loop variable is safe
+		t.Run(testCase.Name, func(t *testing.T) {
+			cli := nvmlClient{driver: testCase.DriverConfiguration}
+			statsData, err := cli.GetStatsData()
+			if testCase.ExpectedError {
+				require.Error(t, err)
+			} else {
+				require.NoError(t, err)
+			}
+			require.Equal(t, testCase.ExpectedResult, statsData)
+		})
+	}
+}
--- a/plugins/device/cmd/nvidia/nvml/driver.go
+++ b/plugins/device/cmd/nvidia/nvml/driver.go
@@ -0,0 +1,138 @@
+package nvml
+
+import (
+	"github.com/NVIDIA/gpu-monitoring-tools/bindings/go/nvml"
+)
+
+// DeviceInfo represents nvml device data.
+// This struct is returned by the NvmlDriver DeviceInfoByIndex and
+// DeviceInfoAndStatusByIndex methods.
+type DeviceInfo struct {
+	// The following fields are guaranteed to be retrieved from nvml.
+	// NOTE: nvmlDriver.DeviceInfoAndStatusByIndex does not populate
+	// DisplayState and PersistenceMode; only DeviceInfoByIndex sets them.
+	UUID            string
+	PCIBusID        string
+	DisplayState    string
+	PersistenceMode string
+
+	// The following fields can be nil after a call to nvml, because nvml was
+	// not able to retrieve these fields for the specific nvidia card
+	Name               *string
+	MemoryMiB          *uint64
+	PowerW             *uint
+	BAR1MiB            *uint64
+	PCIBandwidthMBPerS *uint
+	CoresClockMHz      *uint
+	MemoryClockMHz     *uint
+}
+
+// DeviceStatus represents nvml device status.
+// This struct is returned by the NvmlDriver DeviceInfoAndStatusByIndex method.
+type DeviceStatus struct {
+	// The following fields can be nil after a call to nvml, because nvml was
+	// not able to retrieve these fields for the specific nvidia card
+	PowerUsageW        *uint
+	TemperatureC       *uint
+	GPUUtilization     *uint // %
+	MemoryUtilization  *uint // %
+	EncoderUtilization *uint // %
+	DecoderUtilization *uint // %
+	BAR1UsedMiB        *uint64
+	UsedMemoryMiB      *uint64
+	ECCErrorsL1Cache   *uint64
+	ECCErrorsL2Cache   *uint64
+	ECCErrorsDevice    *uint64
+}
+
+// NvmlDriver represents the set of methods used to query the nvml library
+type NvmlDriver interface {
+	// Initialize prepares the nvml library for use
+	Initialize() error
+	// Shutdown stops any further interaction with nvml
+	Shutdown() error
+	// SystemDriverVersion returns the installed driver version
+	SystemDriverVersion() (string, error)
+	// DeviceCount reports the number of available GPU devices
+	DeviceCount() (uint, error)
+	// DeviceInfoByIndex returns DeviceInfo for the GPU at the given index
+	DeviceInfoByIndex(uint) (*DeviceInfo, error)
+	// DeviceInfoAndStatusByIndex returns DeviceInfo and DeviceStatus for the
+	// GPU at the given index
+	DeviceInfoAndStatusByIndex(uint) (*DeviceInfo, *DeviceStatus, error)
+}
+
+// nvmlDriver implements NvmlDriver on top of the NVIDIA nvml Go bindings.
+// Users are required to call the Initialize method before using any other
+// methods.
+type nvmlDriver struct{}
+
+// Initialize nvml library by locating nvml shared object file and calling dlopen
+func (n *nvmlDriver) Initialize() error {
+	return nvml.Init()
+}
+
+// Shutdown stops any further interaction with nvml by delegating to
+// nvml.Shutdown
+func (n *nvmlDriver) Shutdown() error {
+	return nvml.Shutdown()
+}
+
+// SystemDriverVersion returns the installed driver version as reported by
+// nvml.GetDriverVersion
+func (n *nvmlDriver) SystemDriverVersion() (string, error) {
+	return nvml.GetDriverVersion()
+}
+
+// DeviceCount reports the number of available GPU devices as reported by
+// nvml.GetDeviceCount
+func (n *nvmlDriver) DeviceCount() (uint, error) {
+	return nvml.GetDeviceCount()
+}
+
+// DeviceInfoByIndex returns DeviceInfo for the GPU at the given index in the
+// system device list. Both the device handle and its device mode must be
+// retrievable; an error from either call is returned unchanged.
+func (n *nvmlDriver) DeviceInfoByIndex(index uint) (*DeviceInfo, error) {
+	dev, err := nvml.NewDevice(index)
+	if err != nil {
+		return nil, err
+	}
+
+	mode, err := dev.GetDeviceMode()
+	if err != nil {
+		return nil, err
+	}
+
+	info := &DeviceInfo{
+		UUID:               dev.UUID,
+		PCIBusID:           dev.PCI.BusID,
+		DisplayState:       mode.DisplayInfo.Mode.String(),
+		PersistenceMode:    mode.Persistence.String(),
+		Name:               dev.Model,
+		MemoryMiB:          dev.Memory,
+		PowerW:             dev.Power,
+		BAR1MiB:            dev.PCI.BAR1,
+		PCIBandwidthMBPerS: dev.PCI.Bandwidth,
+		CoresClockMHz:      dev.Clocks.Cores,
+		MemoryClockMHz:     dev.Clocks.Memory,
+	}
+	return info, nil
+}
+
+// DeviceInfoAndStatusByIndex returns DeviceInfo and DeviceStatus for the GPU
+// at the given index in the system device list.
+//
+// The returned DeviceInfo does not have DisplayState and PersistenceMode set,
+// because this method does not query the device mode.
+func (n *nvmlDriver) DeviceInfoAndStatusByIndex(index uint) (*DeviceInfo, *DeviceStatus, error) {
+	device, err := nvml.NewDevice(index)
+	if err != nil {
+		return nil, nil, err
+	}
+	status, err := device.Status()
+	if err != nil {
+		return nil, nil, err
+	}
+	return &DeviceInfo{
+			UUID:               device.UUID,
+			Name:               device.Model,
+			MemoryMiB:          device.Memory,
+			PowerW:             device.Power,
+			BAR1MiB:            device.PCI.BAR1,
+			PCIBandwidthMBPerS: device.PCI.Bandwidth,
+			PCIBusID:           device.PCI.BusID,
+			CoresClockMHz:      device.Clocks.Cores,
+			MemoryClockMHz:     device.Clocks.Memory,
+		}, &DeviceStatus{
+			TemperatureC:       status.Temperature,
+			GPUUtilization:     status.Utilization.GPU,
+			MemoryUtilization:  status.Utilization.Memory,
+			EncoderUtilization: status.Utilization.Encoder,
+			DecoderUtilization: status.Utilization.Decoder,
+			UsedMemoryMiB:      status.Memory.Global.Used,
+			ECCErrorsL1Cache:   status.Memory.ECCErrors.L1Cache,
+			ECCErrorsL2Cache:   status.Memory.ECCErrors.L2Cache,
+			ECCErrorsDevice:    status.Memory.ECCErrors.Device,
+			PowerUsageW:        status.Power,
+			BAR1UsedMiB:        status.PCI.BAR1Used,
+		}, nil
+}
--- a/plugins/device/cmd/nvidia/stats.go
+++ b/plugins/device/cmd/nvidia/stats.go
@@ -0,0 +1,301 @@
+package nvidia
+
+import (
+	"context"
+	"time"
+
+	"github.com/hashicorp/nomad/plugins/device"
+	"github.com/hashicorp/nomad/plugins/device/cmd/nvidia/nvml"
+)
+
+// Each reported statistic is described by three constants: <Name>Attr is the
+// key under which the value is stored in the stats attribute map, <Name>Unit
+// is the unit of measurement and <Name>Desc is a human readable description.
+const (
+	// Attribute names for reporting stats output
+	PowerUsageAttr = "Power usage"
+	PowerUsageUnit = "W"
+	PowerUsageDesc = "Power usage for this GPU in watts and " +
+		"its associated circuitry (e.g. memory) / Maximum GPU Power"
+	GPUUtilizationAttr = "GPU utilization"
+	GPUUtilizationUnit = "%"
+	GPUUtilizationDesc = "Percent of time over the past sample period " +
+		"during which one or more kernels were executing on the GPU."
+	MemoryUtilizationAttr  = "Memory utilization"
+	MemoryUtilizationUnit  = "%"
+	MemoryUtilizationDesc  = "Percentage of bandwidth used during the past sample period"
+	EncoderUtilizationAttr = "Encoder utilization"
+	EncoderUtilizationUnit = "%"
+	EncoderUtilizationDesc = "Percent of time over the past sample period " +
+		"during which GPU Encoder was used"
+	DecoderUtilizationAttr = "Decoder utilization"
+	DecoderUtilizationUnit = "%"
+	DecoderUtilizationDesc = "Percent of time over the past sample period " +
+		"during which GPU Decoder was used"
+	TemperatureAttr      = "Temperature"
+	TemperatureUnit      = "C" // Celsius degrees
+	TemperatureDesc      = "Temperature of the Unit"
+	MemoryStateAttr      = "Memory state"
+	MemoryStateUnit      = "MiB" // Mebibytes
+	MemoryStateDesc      = "UsedMemory / TotalMemory"
+	BAR1StateAttr        = "BAR1 buffer state"
+	BAR1StateUnit        = "MiB" // Mebibytes
+	BAR1StateDesc        = "UsedBAR1 / TotalBAR1"
+	ECCErrorsL1CacheAttr = "ECC L1 errors"
+	ECCErrorsL1CacheUnit = "#" // number of errors
+	ECCErrorsL1CacheDesc = "Requested L1Cache error counter for the device"
+	ECCErrorsL2CacheAttr = "ECC L2 errors"
+	ECCErrorsL2CacheUnit = "#" // number of errors
+	ECCErrorsL2CacheDesc = "Requested L2Cache error counter for the device"
+	ECCErrorsDeviceAttr  = "ECC memory errors"
+	ECCErrorsDeviceUnit  = "#" // number of errors
+	ECCErrorsDeviceDesc  = "Requested memory error counter for the device"
+)
+
+// stats is the long running goroutine that streams device statistics.
+//
+// It closes the stats channel when it returns. If NVML failed to initialize
+// it logs the error and returns immediately. Otherwise it emits one batch of
+// statistics right away and then one every d.statsPeriod until ctx is done.
+func (d *NvidiaDevice) stats(ctx context.Context, stats chan<- *device.StatsResponse) {
+	defer close(stats)
+
+	if d.nvmlClientInitializationError != nil {
+		d.logger.Error("exiting stats due to problems with NVML loading", "error", d.nvmlClientInitializationError)
+		return
+	}
+
+	// Create a timer that will fire immediately for the first detection
+	timer := time.NewTimer(0)
+	// Stop the pending timer when the stream ends so it does not linger
+	// until its next expiry after the context has been cancelled
+	defer timer.Stop()
+
+	for {
+		select {
+		case <-ctx.Done():
+			return
+		case <-timer.C:
+			// The channel was just drained, so Reset is safe here
+			timer.Reset(d.statsPeriod)
+		}
+
+		d.writeStatsToChannel(stats, time.Now())
+	}
+}
+
+// filterStatsByID returns, in their original order, the entries of stats
+// whose UUID is a key of the IDs set. The result is nil when nothing matches.
+func filterStatsByID(stats []*nvml.StatsData, IDs map[string]struct{}) []*nvml.StatsData {
+	var kept []*nvml.StatsData
+	for _, item := range stats {
+		_, wanted := IDs[item.UUID]
+		if !wanted {
+			continue
+		}
+		kept = append(kept, item)
+	}
+	return kept
+}
+
+// writeStatsToChannel fetches StatsData from the NVML backend, keeps only the
+// devices tracked by this NvidiaDevice, groups the entries by DeviceName and
+// sends one DeviceGroupStats per group over the provided channel. If the
+// backend call fails, the error is logged and sent over the channel instead.
+func (d *NvidiaDevice) writeStatsToChannel(stats chan<- *device.StatsResponse, timestamp time.Time) {
+	allStats, err := d.nvmlClient.GetStatsData()
+	if err != nil {
+		d.logger.Error("failed to get nvidia stats", "error", err)
+		stats <- &device.StatsResponse{
+			Error: err,
+		}
+		return
+	}
+
+	// keep only stats of devices that are stored in the NvidiaDevice struct
+	d.deviceLock.RLock()
+	tracked := filterStatsByID(allStats, d.devices)
+	d.deviceLock.RUnlock()
+
+	// bucket the stats by device name
+	byName := make(map[string][]*nvml.StatsData)
+	for _, item := range tracked {
+		// devices whose name nvml could not detect all share the
+		// 'notAvailable' group
+		name := notAvailable
+		if item.DeviceName != nil {
+			name = *item.DeviceName
+		}
+		byName[name] = append(byName[name], item)
+	}
+
+	// build one device.DeviceGroupStats per bucket
+	groups := make([]*device.DeviceGroupStats, 0, len(byName))
+	for name, items := range byName {
+		groups = append(groups, statsForGroup(name, items, timestamp))
+	}
+
+	stats <- &device.StatsResponse{
+		Groups: groups,
+	}
+}
+
+// newNotAvailableDeviceStats builds a StatValue with the given unit and
+// description whose string value is the 'notAvailable' placeholder.
+func newNotAvailableDeviceStats(unit, desc string) *device.StatValue {
+	placeholder := &device.StatValue{
+		Unit:      unit,
+		Desc:      desc,
+		StringVal: notAvailable,
+	}
+	return placeholder
+}
+
+// statsForGroup builds the device.DeviceGroupStats for the group called
+// groupName, with one DeviceStats entry per item of groupStats keyed by the
+// item's UUID.
+func statsForGroup(groupName string, groupStats []*nvml.StatsData, timestamp time.Time) *device.DeviceGroupStats {
+	perInstance := make(map[string]*device.DeviceStats)
+	for i := range groupStats {
+		item := groupStats[i]
+		perInstance[item.UUID] = statsForItem(item, timestamp)
+	}
+
+	group := &device.DeviceGroupStats{
+		Vendor:        vendor,
+		Type:          deviceType,
+		Name:          groupName,
+		InstanceStats: perInstance,
+	}
+	return group
+}
+
+// statsForItem converts a single nvml.StatsData into a device.DeviceStats
+// stamped with timestamp. The temperature reading doubles as the summary.
+//
+// nvml.StatsData holds pointers that can be nil. A statistic whose input (or,
+// for "used / total" statistics, either input) is nil is reported with the
+// 'notAvailable' placeholder instead of a number.
+func statsForItem(statsItem *nvml.StatsData, timestamp time.Time) *device.DeviceStats {
+	// gauge reports a single *uint reading
+	gauge := func(value *uint, unit, desc string) *device.StatValue {
+		if value == nil {
+			return newNotAvailableDeviceStats(unit, desc)
+		}
+		return &device.StatValue{
+			Unit:            unit,
+			Desc:            desc,
+			IntNumeratorVal: int64(*value),
+		}
+	}
+
+	// counter reports a single *uint64 reading
+	counter := func(value *uint64, unit, desc string) *device.StatValue {
+		if value == nil {
+			return newNotAvailableDeviceStats(unit, desc)
+		}
+		return &device.StatValue{
+			Unit:            unit,
+			Desc:            desc,
+			IntNumeratorVal: int64(*value),
+		}
+	}
+
+	// usage reports a "used / total" pair of *uint64 readings
+	usage := func(used, total *uint64, unit, desc string) *device.StatValue {
+		if used == nil || total == nil {
+			return newNotAvailableDeviceStats(unit, desc)
+		}
+		return &device.StatValue{
+			Unit:              unit,
+			Desc:              desc,
+			IntNumeratorVal:   int64(*used),
+			IntDenominatorVal: int64(*total),
+		}
+	}
+
+	// power usage is the only "used / total" pair made of *uint readings
+	var powerUsageStat *device.StatValue
+	if statsItem.PowerUsageW != nil && statsItem.PowerW != nil {
+		powerUsageStat = &device.StatValue{
+			Unit:              PowerUsageUnit,
+			Desc:              PowerUsageDesc,
+			IntNumeratorVal:   int64(*statsItem.PowerUsageW),
+			IntDenominatorVal: int64(*statsItem.PowerW),
+		}
+	} else {
+		powerUsageStat = newNotAvailableDeviceStats(PowerUsageUnit, PowerUsageDesc)
+	}
+
+	// the same temperature value is used as summary and as attribute
+	temperatureStat := gauge(statsItem.TemperatureC, TemperatureUnit, TemperatureDesc)
+
+	attributes := map[string]*device.StatValue{
+		PowerUsageAttr:         powerUsageStat,
+		GPUUtilizationAttr:     gauge(statsItem.GPUUtilization, GPUUtilizationUnit, GPUUtilizationDesc),
+		MemoryUtilizationAttr:  gauge(statsItem.MemoryUtilization, MemoryUtilizationUnit, MemoryUtilizationDesc),
+		EncoderUtilizationAttr: gauge(statsItem.EncoderUtilization, EncoderUtilizationUnit, EncoderUtilizationDesc),
+		DecoderUtilizationAttr: gauge(statsItem.DecoderUtilization, DecoderUtilizationUnit, DecoderUtilizationDesc),
+		TemperatureAttr:        temperatureStat,
+		MemoryStateAttr:        usage(statsItem.UsedMemoryMiB, statsItem.MemoryMiB, MemoryStateUnit, MemoryStateDesc),
+		BAR1StateAttr:          usage(statsItem.BAR1UsedMiB, statsItem.BAR1MiB, BAR1StateUnit, BAR1StateDesc),
+		ECCErrorsL1CacheAttr:   counter(statsItem.ECCErrorsL1Cache, ECCErrorsL1CacheUnit, ECCErrorsL1CacheDesc),
+		ECCErrorsL2CacheAttr:   counter(statsItem.ECCErrorsL2Cache, ECCErrorsL2CacheUnit, ECCErrorsL2CacheDesc),
+		ECCErrorsDeviceAttr:    counter(statsItem.ECCErrorsDevice, ECCErrorsDeviceUnit, ECCErrorsDeviceDesc),
+	}
+
+	return &device.DeviceStats{
+		Summary: temperatureStat,
+		Stats: &device.StatObject{
+			Attributes: attributes,
+		},
+		Timestamp: timestamp,
+	}
+}
--- a/plugins/device/cmd/nvidia/stats_test.go
+++ b/plugins/device/cmd/nvidia/stats_test.go
--- a/vendor/github.com/NVIDIA/gpu-monitoring-tools/CLA
+++ b/vendor/github.com/NVIDIA/gpu-monitoring-tools/CLA
@@ -0,0 +1,160 @@
+                            GPU Monitoring Tools
+   Software Grant and Corporate Contributor License Agreement ("Agreement")
+
+   Thank you for your interest in the gpu-monitoring-tools Project (the
+   "Project"). In order to clarify the intellectual property license
+   granted with Contributions from any person or entity, NVIDIA
+   Corporation (the "Copyright Holders") must have a Contributor License
+   Agreement (CLA) on file that has been signed by each Contributor,
+   indicating agreement to the license terms below. This license is
+   for your protection as a Contributor as well as the protection of the
+   Project and its users; it does not change your rights to use your own
+   Contributions for any other purpose.
+
+   This version of the Agreement allows an entity (the "Corporation") to
+   submit Contributions to the Project, to authorize Contributions
+   submitted by its designated employees to the Project, and to grant
+   copyright and patent licenses thereto to the Copyright Holders.
+
+   If you have not already done so, please complete and sign, then scan and
+   email a pdf file of this Agreement to digits@nvidia.com.
+   Please read this document carefully before signing and keep a copy for
+   your records.
+
+   Corporation name:    ________________________________________________
+
+   Corporation address: ________________________________________________
+
+                        ________________________________________________
+
+                        ________________________________________________
+
+   Point of Contact:    ________________________________________________
+
+          E-Mail:       ________________________________________________
+
+          Telephone:    _____________________ Fax: _____________________
+
+
+   You accept and agree to the following terms and conditions for Your
+   present and future Contributions submitted to the Project. In
+   return, the Copyright Holders shall not use Your Contributions in a way
+   that is contrary to the public benefit or inconsistent with its nonprofit
+   status and bylaws in effect at the time of the Contribution. Except
+   for the license granted herein to the Copyright Holders and recipients of
+   software distributed by the Copyright Holders, You reserve all right, title,
+   and interest in and to Your Contributions.
+
+   1. Definitions.
+
+      "You" (or "Your") shall mean the copyright owner or legal entity
+      authorized by the copyright owner that is making this Agreement
+      with the Copyright Holders. For legal entities, the entity making a
+      Contribution and all other entities that control, are controlled by,
+      or are under common control with that entity are considered to be a
+      single Contributor. For the purposes of this definition, "control"
+      means (i) the power, direct or indirect, to cause the direction or
+      management of such entity, whether by contract or otherwise, or
+      (ii) ownership of fifty percent (50%) or more of the outstanding
+      shares, or (iii) beneficial ownership of such entity.
+
+      "Contribution" shall mean the code, documentation or other original
+      works of authorship expressly identified in Schedule B, as well as
+      any original work of authorship, including
+      any modifications or additions to an existing work, that is intentionally
+      submitted by You to the Copyright Holders for inclusion in, or
+      documentation of, any of the products owned or managed by the
+      Copyright Holders (the "Work"). For the purposes of this definition,
+      "submitted" means any form of electronic, verbal, or written
+      communication sent to the Copyright Holders or its representatives,
+      including but not limited to communication on electronic mailing
+      lists, source code control systems, and issue tracking systems
+      that are managed by, or on behalf of, the Copyright Holders for the
+      purpose of discussing and improving the Work, but excluding
+      communication that is conspicuously marked or otherwise designated
+      in writing by You as "Not a Contribution."
+
+   2. Grant of Copyright License. Subject to the terms and conditions
+      of this Agreement, You hereby grant to the Copyright Holders and to
+      recipients of software distributed by the Copyright Holders a
+      perpetual, worldwide, non-exclusive, no-charge, royalty-free,
+      irrevocable copyright license to reproduce, prepare derivative works
+      of, publicly display, publicly perform, sublicense, and distribute
+      Your Contributions and such derivative works.
+
+   3. Grant of Patent License. Subject to the terms and conditions of
+      this Agreement, You hereby grant to the Copyright Holders and to
+      recipients of software distributed by the Copyright Holders
+      a perpetual, worldwide, non-exclusive, no-charge, royalty-free,
+      irrevocable (except as stated in this section) patent license
+      to make, have made, use, offer to sell, sell, import, and otherwise
+      transfer the Work, where such license applies only to those
+      patent claims licensable by You that are necessarily infringed
+      by Your Contribution(s) alone or by combination of Your Contribution(s)
+      with the Work to which such Contribution(s) were submitted.
+      If any entity institutes patent litigation against You or any
+      other entity (including a cross-claim or counterclaim in a lawsuit)
+      alleging that your Contribution, or the Work to which you have
+      contributed, constitutes direct or contributory patent infringement,
+      then any patent licenses granted to that entity under this Agreement
+      for that Contribution or Work shall terminate as of the date such
+      litigation is filed.
+
+   4. You represent that You are legally entitled to grant the above
+      license. You represent further that each employee of the
+      Corporation designated on Schedule A below (or in a subsequent
+      written modification to that Schedule) is authorized to submit
+      Contributions on behalf of the Corporation.
+
+   5. You represent that each of Your Contributions is Your original
+      creation (see section 7 for submissions on behalf of others).
+
+   6. You are not expected to provide support for Your Contributions,
+      except to the extent You desire to provide support. You may provide
+      support for free, for a fee, or not at all. Unless required by
+      applicable law or agreed to in writing, You provide Your
+      Contributions on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS
+      OF ANY KIND, either express or implied, including, without
+      limitation, any warranties or conditions of TITLE, NON-INFRINGEMENT,
+      MERCHANTABILITY, or FITNESS FOR A PARTICULAR PURPOSE.
+
+   7. Should You wish to submit work that is not Your original creation,
+      You may submit it to the Copyright Holders separately from any
+      Contribution, identifying the complete details of its source and
+      of any license or other restriction (including, but not limited
+      to, related patents, trademarks, and license agreements) of which
+      you are personally aware, and conspicuously marking the work as
+      "Submitted on behalf of a third-party: [named here]".
+
+   8. It is your responsibility to notify the Copyright Holders when any change
+      is required to the list of designated employees authorized to submit
+      Contributions on behalf of the Corporation, or to the Corporation's
+      Point of Contact with the Copyright Holders.
+
+
+
+   Please sign: __________________________________ Date: _______________
+
+   Title:       __________________________________
+
+   Corporation: __________________________________
+
+
+
+
+Schedule A
+
+   [Initial list of designated employees.  NB: authorization is not
+    tied to particular Contributions.]
+
+
+
+
+
+
+Schedule B
+
+   [Identification of optional concurrent software grant.  Would be
+    left blank or omitted if there is no concurrent software grant.]
+
+
--- a/vendor/github.com/NVIDIA/gpu-monitoring-tools/LICENSE
+++ b/vendor/github.com/NVIDIA/gpu-monitoring-tools/LICENSE
@@ -0,0 +1,29 @@
+BSD 3-Clause License
+
+Copyright (c) 2018, NVIDIA Corporation
+All rights reserved.
+
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are met:
+
+* Redistributions of source code must retain the above copyright notice, this
+  list of conditions and the following disclaimer.
+
+* Redistributions in binary form must reproduce the above copyright notice,
+  this list of conditions and the following disclaimer in the documentation
+  and/or other materials provided with the distribution.
+
+* Neither the name of the copyright holder nor the names of its
+  contributors may be used to endorse or promote products derived from
+  this software without specific prior written permission.
+
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
--- a/vendor/github.com/NVIDIA/gpu-monitoring-tools/README.md
+++ b/vendor/github.com/NVIDIA/gpu-monitoring-tools/README.md
@@ -0,0 +1,34 @@
+# NVIDIA GPU Monitoring Tools
+
+## NVML Go Bindings
+
+[NVIDIA Management Library (NVML)](https://developer.nvidia.com/nvidia-management-library-nvml) is a C-based API for monitoring and managing NVIDIA GPU devices. 
+NVML go bindings are taken from [nvidia-docker 1.0](https://github.com/NVIDIA/nvidia-docker/tree/1.0) with some improvements and additions. NVML headers are also added to the package to make it easy to use and build.
+
+### NVML Samples
+Three [samples](https://github.com/NVIDIA/gpu-monitoring-tools/blob/master/bindings/go/samples/nvml/README.md) are included to demonstrate how to use the NVML API.
+
+
+## DCGM Go Bindings
+
+[NVIDIA Data Center GPU Manager (DCGM)](https://developer.nvidia.com/data-center-gpu-manager-dcgm) is a set of tools for managing and monitoring NVIDIA GPUs in cluster environments. It's a low overhead tool suite that performs a variety of functions on each host system including active health monitoring, diagnostics, system validation, policies, power and clock management, group configuration and accounting.
+
+DCGM go bindings make administering and monitoring containerized GPU applications easy.
+
+### DCGM Samples
+
+DCGM can be run in different modes, seven [samples](https://github.com/NVIDIA/gpu-monitoring-tools/blob/master/bindings/go/samples/dcgm/README.md) and a [REST API](https://github.com/NVIDIA/gpu-monitoring-tools/blob/master/bindings/go/samples/dcgm/restApi/README.md) are included for showing how to use the DCGM API and run it in different modes.
+
+
+## DCGM exporter
+
+GPU metrics exporter for [Prometheus](https://prometheus.io/) leveraging [NVIDIA Data Center GPU Manager (DCGM)](https://developer.nvidia.com/data-center-gpu-manager-dcgm) is a simple shell script that starts nv-hostengine, reads GPU metrics every 1 second and converts it to a standard Prometheus format.
+
+Find the installation and run instructions [here](https://github.com/NVIDIA/gpu-monitoring-tools/blob/master/exporters/prometheus-dcgm/README.md).
+
+## Issues and Contributing
+
+A signed copy of the [Contributor License Agreement](https://github.com/NVIDIA/gpu-monitoring-tools/blob/master/CLA) needs to be provided to <a href="mailto:digits@nvidia.com">digits@nvidia.com</a> before any change can be accepted.
+
+* Please let us know by [filing a new issue](https://github.com/NVIDIA/gpu-monitoring-tools/issues/new)
+* You can contribute by opening a [pull request](https://help.github.com/articles/using-pull-requests/)
--- a/vendor/github.com/NVIDIA/gpu-monitoring-tools/bindings/go/nvml/bindings.go
+++ b/vendor/github.com/NVIDIA/gpu-monitoring-tools/bindings/go/nvml/bindings.go
@@ -0,0 +1,634 @@
+// Copyright (c) 2015-2018, NVIDIA CORPORATION. All rights reserved.
+
+package nvml
+
+// #cgo LDFLAGS: -ldl -Wl,--unresolved-symbols=ignore-in-object-files
+// #include "nvml_dl.h"
+import "C"
+
+import (
+	"errors"
+	"fmt"
+	"io/ioutil"
+	"os"
+	"sort"
+	"strconv"
+	"strings"
+)
+
+// Buffer sizes used by the wrappers in this file, plus a re-exported NVML
+// event type.
+const (
+	// szDriver is the length of the buffer handed to nvmlSystemGetDriverVersion.
+	szDriver   = C.NVML_SYSTEM_DRIVER_VERSION_BUFFER_SIZE
+	// szName is the length of the buffer handed to nvmlDeviceGetName.
+	szName     = C.NVML_DEVICE_NAME_BUFFER_SIZE
+	// szUUID is the length of the buffer handed to nvmlDeviceGetUUID.
+	szUUID     = C.NVML_DEVICE_UUID_BUFFER_SIZE
+	// szProcs is the capacity of the process arrays passed to the
+	// running-process queries.
+	szProcs    = 32
+	// szProcName is the length of the buffer handed to nvmlSystemGetProcessName.
+	szProcName = 64
+
+	// XidCriticalError re-exports NVML's nvmlEventTypeXidCriticalError.
+	XidCriticalError = C.nvmlEventTypeXidCriticalError
+)
+
+// handle wraps an NVML device handle; the per-device query methods in this
+// file are defined on it.
+type handle struct{ dev C.nvmlDevice_t }
+// EventSet wraps an NVML event set, as produced by NewEventSet.
+type EventSet struct{ set C.nvmlEventSet_t }
+// Event is the value returned by WaitForEvent.
+type Event struct {
+	// UUID is the result of deviceGetUUID for the device attached to the event.
+	UUID  *string
+	// Etype is NVML's eventType field for the event.
+	Etype uint64
+	// Edata is NVML's eventData field for the event.
+	Edata uint64
+}
+
+// uintPtr converts a C unsigned int to a Go uint and returns a pointer to the
+// newly allocated copy.
+func uintPtr(c C.uint) *uint {
+	v := new(uint)
+	*v = uint(c)
+	return v
+}
+
+// uint64Ptr converts a C unsigned long long to a Go uint64 and returns a
+// pointer to the newly allocated copy.
+func uint64Ptr(c C.ulonglong) *uint64 {
+	v := new(uint64)
+	*v = uint64(c)
+	return v
+}
+
+// stringPtr copies a NUL-terminated C string into a Go string and returns a
+// pointer to that copy.
+func stringPtr(c *C.char) *string {
+	v := new(string)
+	*v = C.GoString(c)
+	return v
+}
+
+// errorString turns an NVML status code into a Go error. NVML_SUCCESS maps to
+// nil; every other code yields an error carrying NVML's own description,
+// prefixed with "nvml: ".
+func errorString(ret C.nvmlReturn_t) error {
+	if ret != C.NVML_SUCCESS {
+		return fmt.Errorf("nvml: %v", C.GoString(C.nvmlErrorString(ret)))
+	}
+	return nil
+}
+
+// init_ initialises NVML through nvmlInit_dl. A missing shared library is
+// reported with a dedicated message; any other status goes through
+// errorString.
+func init_() error {
+	switch ret := C.nvmlInit_dl(); ret {
+	case C.NVML_ERROR_LIBRARY_NOT_FOUND:
+		return errors.New("could not load NVML library")
+	default:
+		return errorString(ret)
+	}
+}
+
+// NewEventSet asks NVML to create an event set and returns it wrapped in an
+// EventSet.
+//
+// NOTE(review): the status returned by nvmlEventSetCreate is discarded, so a
+// failed creation cannot be detected by the caller.
+func NewEventSet() EventSet {
+	var raw C.nvmlEventSet_t
+	C.nvmlEventSetCreate(&raw)
+	return EventSet{set: raw}
+}
+
+func RegisterEvent(es EventSet, event int) error {
+	n, err := deviceGetCount()
+	if err != nil {
+		return err
+	}
+
+	var i uint
+	for i = 0; i < n; i++ {
+		h, err := deviceGetHandleByIndex(i)
+		if err != nil {
+			return err
+		}
+
+		r := C.nvmlDeviceRegisterEvents(h.dev, C.ulonglong(event), es.set)
+		if r != C.NVML_SUCCESS {
+			return errorString(r)
+		}
+	}
+
+	return nil
+}
+
+// RegisterEventForDevice registers the given event type on the single device
+// whose UUID equals uuid, attaching the registration to es.
+//
+// Devices are scanned in index order. The first lookup error aborts the scan
+// and is returned. If no device matches, an "nvml: device not found" error is
+// returned.
+func RegisterEventForDevice(es EventSet, event int, uuid string) error {
+	n, err := deviceGetCount()
+	if err != nil {
+		return err
+	}
+
+	var i uint
+	for i = 0; i < n; i++ {
+		h, err := deviceGetHandleByIndex(i)
+		if err != nil {
+			return err
+		}
+
+		duuid, err := h.deviceGetUUID()
+		if err != nil {
+			return err
+		}
+
+		// deviceGetUUID returns (nil, nil) when the device does not support
+		// UUID queries. Such a device can never match, so skip it instead of
+		// dereferencing a nil pointer.
+		if duuid == nil || *duuid != uuid {
+			continue
+		}
+
+		r := C.nvmlDeviceRegisterEvents(h.dev, C.ulonglong(event), es.set)
+		if r != C.NVML_SUCCESS {
+			return errorString(r)
+		}
+
+		return nil
+	}
+
+	return fmt.Errorf("nvml: device not found")
+}
+
+// DeleteEventSet releases the NVML event set wrapped by es via
+// nvmlEventSetFree. The status returned by NVML is not inspected.
+func DeleteEventSet(es EventSet) {
+	C.nvmlEventSetFree(es.set)
+}
+
+// WaitForEvent blocks in nvmlEventSetWait on es with the given timeout and
+// returns the event data NVML filled in, together with the status of the wait
+// converted by errorString.
+func WaitForEvent(es EventSet, timeout uint) (Event, error) {
+	var data C.nvmlEventData_t
+
+	ret := C.nvmlEventSetWait(es.set, &data, C.uint(timeout))
+
+	// The UUID lookup is best effort: its error is dropped and only the status
+	// of the wait itself is reported to the caller.
+	source := handle{data.device}
+	uuid, _ := source.deviceGetUUID()
+
+	ev := Event{
+		UUID:  uuid,
+		Etype: uint64(data.eventType),
+		Edata: uint64(data.eventData),
+	}
+	return ev, errorString(ret)
+}
+
+// shutdown calls nvmlShutdown_dl and converts its status with errorString.
+func shutdown() error {
+	return errorString(C.nvmlShutdown_dl())
+}
+
+// systemGetDriverVersion reads the driver version string from NVML into a
+// fixed-size buffer. The buffer contents and the converted status are both
+// returned, whatever the status is.
+func systemGetDriverVersion() (string, error) {
+	var buf [szDriver]C.char
+
+	ret := C.nvmlSystemGetDriverVersion(&buf[0], szDriver)
+	version := C.GoString(&buf[0])
+	return version, errorString(ret)
+}
+
+// systemGetProcessName asks NVML for the name of the process with the given
+// pid. The buffer contents and the converted status are both returned,
+// whatever the status is.
+func systemGetProcessName(pid uint) (string, error) {
+	var buf [szProcName]C.char
+
+	ret := C.nvmlSystemGetProcessName(C.uint(pid), &buf[0], szProcName)
+	name := C.GoString(&buf[0])
+	return name, errorString(ret)
+}
+
+// deviceGetCount returns the device count reported by nvmlDeviceGetCount
+// together with the converted status.
+func deviceGetCount() (uint, error) {
+	var count C.uint
+
+	ret := C.nvmlDeviceGetCount(&count)
+	total := uint(count)
+	return total, errorString(ret)
+}
+
+// deviceGetHandleByIndex looks up the NVML device at index idx and returns it
+// wrapped in a handle, together with the converted status.
+func deviceGetHandleByIndex(idx uint) (handle, error) {
+	var raw C.nvmlDevice_t
+
+	ret := C.nvmlDeviceGetHandleByIndex(C.uint(idx), &raw)
+	wrapped := handle{dev: raw}
+	return wrapped, errorString(ret)
+}
+
+// deviceGetTopologyCommonAncestor queries the topology level shared by two
+// devices. It returns (nil, nil) when NVML lacks the function or reports the
+// query as unsupported; otherwise it returns the level and the converted
+// status.
+func deviceGetTopologyCommonAncestor(h1, h2 handle) (*uint, error) {
+	var level C.nvmlGpuTopologyLevel_t
+
+	switch ret := C.nvmlDeviceGetTopologyCommonAncestor_dl(h1.dev, h2.dev, &level); ret {
+	case C.NVML_ERROR_FUNCTION_NOT_FOUND, C.NVML_ERROR_NOT_SUPPORTED:
+		return nil, nil
+	default:
+		return uintPtr(C.uint(level)), errorString(ret)
+	}
+}
+
+// deviceGetName returns the device name reported by NVML, or (nil, nil) when
+// the query is not supported. For any other status the buffer contents are
+// returned alongside the converted status.
+func (h handle) deviceGetName() (*string, error) {
+	var buf [szName]C.char
+
+	ret := C.nvmlDeviceGetName(h.dev, &buf[0], szName)
+	if ret != C.NVML_ERROR_NOT_SUPPORTED {
+		return stringPtr(&buf[0]), errorString(ret)
+	}
+	return nil, nil
+}
+
+// deviceGetUUID returns the device UUID reported by NVML, or (nil, nil) when
+// the query is not supported. For any other status the buffer contents are
+// returned alongside the converted status.
+func (h handle) deviceGetUUID() (*string, error) {
+	var buf [szUUID]C.char
+
+	switch ret := C.nvmlDeviceGetUUID(h.dev, &buf[0], szUUID); ret {
+	case C.NVML_ERROR_NOT_SUPPORTED:
+		return nil, nil
+	default:
+		return stringPtr(&buf[0]), errorString(ret)
+	}
+}
+
+// deviceGetPciInfo returns the busId field of the device's NVML PCI info, or
+// (nil, nil) when the query is not supported.
+func (h handle) deviceGetPciInfo() (*string, error) {
+	var info C.nvmlPciInfo_t
+
+	ret := C.nvmlDeviceGetPciInfo(h.dev, &info)
+	if ret != C.NVML_ERROR_NOT_SUPPORTED {
+		return stringPtr(&info.busId[0]), errorString(ret)
+	}
+	return nil, nil
+}
+
+// deviceGetMinorNumber returns the device minor number reported by NVML, or
+// (nil, nil) when the query is not supported.
+func (h handle) deviceGetMinorNumber() (*uint, error) {
+	var value C.uint
+
+	switch ret := C.nvmlDeviceGetMinorNumber(h.dev, &value); ret {
+	case C.NVML_ERROR_NOT_SUPPORTED:
+		return nil, nil
+	default:
+		return uintPtr(value), errorString(ret)
+	}
+}
+
+// deviceGetBAR1MemoryInfo returns the bar1Total and bar1Used fields of the
+// device's BAR1 memory info, in that order, or all-nil results when the query
+// is not supported.
+func (h handle) deviceGetBAR1MemoryInfo() (*uint64, *uint64, error) {
+	var info C.nvmlBAR1Memory_t
+
+	ret := C.nvmlDeviceGetBAR1MemoryInfo(h.dev, &info)
+	if ret != C.NVML_ERROR_NOT_SUPPORTED {
+		return uint64Ptr(info.bar1Total), uint64Ptr(info.bar1Used), errorString(ret)
+	}
+	return nil, nil, nil
+}
+
+// deviceGetPowerManagementLimit returns the power management limit reported
+// by NVML, or (nil, nil) when the query is not supported.
+func (h handle) deviceGetPowerManagementLimit() (*uint, error) {
+	var limit C.uint
+
+	switch ret := C.nvmlDeviceGetPowerManagementLimit(h.dev, &limit); ret {
+	case C.NVML_ERROR_NOT_SUPPORTED:
+		return nil, nil
+	default:
+		return uintPtr(limit), errorString(ret)
+	}
+}
+
+// deviceGetMaxClockInfo returns the maximum SM and memory clocks, in that
+// order. All-nil results are returned when the SM query is not supported. The
+// memory clock is only queried after the SM query succeeds, and the status of
+// the last query made is the one reported.
+func (h handle) deviceGetMaxClockInfo() (*uint, *uint, error) {
+	var sm, mem C.uint
+
+	ret := C.nvmlDeviceGetMaxClockInfo(h.dev, C.NVML_CLOCK_SM, &sm)
+	switch ret {
+	case C.NVML_ERROR_NOT_SUPPORTED:
+		return nil, nil, nil
+	case C.NVML_SUCCESS:
+		ret = C.nvmlDeviceGetMaxClockInfo(h.dev, C.NVML_CLOCK_MEM, &mem)
+	}
+	return uintPtr(sm), uintPtr(mem), errorString(ret)
+}
+
+// deviceGetMaxPcieLinkGeneration returns the maximum PCIe link generation
+// reported by NVML, or (nil, nil) when the query is not supported.
+func (h handle) deviceGetMaxPcieLinkGeneration() (*uint, error) {
+	var gen C.uint
+
+	ret := C.nvmlDeviceGetMaxPcieLinkGeneration(h.dev, &gen)
+	if ret != C.NVML_ERROR_NOT_SUPPORTED {
+		return uintPtr(gen), errorString(ret)
+	}
+	return nil, nil
+}
+
+// deviceGetMaxPcieLinkWidth returns the maximum PCIe link width reported by
+// NVML, or (nil, nil) when the query is not supported.
+func (h handle) deviceGetMaxPcieLinkWidth() (*uint, error) {
+	var lanes C.uint
+
+	switch ret := C.nvmlDeviceGetMaxPcieLinkWidth(h.dev, &lanes); ret {
+	case C.NVML_ERROR_NOT_SUPPORTED:
+		return nil, nil
+	default:
+		return uintPtr(lanes), errorString(ret)
+	}
+}
+
+// deviceGetPowerUsage returns the power usage reported by NVML, or (nil, nil)
+// when the query is not supported.
+func (h handle) deviceGetPowerUsage() (*uint, error) {
+	var draw C.uint
+
+	ret := C.nvmlDeviceGetPowerUsage(h.dev, &draw)
+	if ret != C.NVML_ERROR_NOT_SUPPORTED {
+		return uintPtr(draw), errorString(ret)
+	}
+	return nil, nil
+}
+
+// deviceGetTemperature returns the reading of the NVML_TEMPERATURE_GPU
+// sensor, or (nil, nil) when the query is not supported.
+func (h handle) deviceGetTemperature() (*uint, error) {
+	var reading C.uint
+
+	switch ret := C.nvmlDeviceGetTemperature(h.dev, C.NVML_TEMPERATURE_GPU, &reading); ret {
+	case C.NVML_ERROR_NOT_SUPPORTED:
+		return nil, nil
+	default:
+		return uintPtr(reading), errorString(ret)
+	}
+}
+
+// deviceGetUtilizationRates returns the gpu and memory fields of NVML's
+// utilization sample, in that order, or all-nil results when the query is not
+// supported.
+func (h handle) deviceGetUtilizationRates() (*uint, *uint, error) {
+	var rates C.nvmlUtilization_t
+
+	ret := C.nvmlDeviceGetUtilizationRates(h.dev, &rates)
+	if ret != C.NVML_ERROR_NOT_SUPPORTED {
+		return uintPtr(rates.gpu), uintPtr(rates.memory), errorString(ret)
+	}
+	return nil, nil, nil
+}
+
+// deviceGetEncoderUtilization returns the encoder utilization reported by
+// NVML, or (nil, nil) when the query is not supported. The sampling period
+// NVML also reports is discarded.
+func (h handle) deviceGetEncoderUtilization() (*uint, error) {
+	var (
+		usage    C.uint
+		sampling C.uint
+	)
+
+	switch ret := C.nvmlDeviceGetEncoderUtilization(h.dev, &usage, &sampling); ret {
+	case C.NVML_ERROR_NOT_SUPPORTED:
+		return nil, nil
+	default:
+		return uintPtr(usage), errorString(ret)
+	}
+}
+
+// deviceGetDecoderUtilization returns the decoder utilization reported by
+// NVML, or (nil, nil) when the query is not supported. The sampling period
+// NVML also reports is discarded.
+func (h handle) deviceGetDecoderUtilization() (*uint, error) {
+	var (
+		usage    C.uint
+		sampling C.uint
+	)
+
+	ret := C.nvmlDeviceGetDecoderUtilization(h.dev, &usage, &sampling)
+	if ret != C.NVML_ERROR_NOT_SUPPORTED {
+		return uintPtr(usage), errorString(ret)
+	}
+	return nil, nil
+}
+
+// deviceGetMemoryInfo returns the device's total memory and its used/free
+// memory, each integer-divided by 1024*1024 (MiB).
+//
+// When the query is not supported all results are zero values with a nil
+// error; on any other failure the results are zero values plus the converted
+// status.
+func (h handle) deviceGetMemoryInfo() (totalMem *uint64, devMem DeviceMemory, err error) {
+	const mib = 1024 * 1024
+
+	var mem C.nvmlMemory_t
+	switch ret := C.nvmlDeviceGetMemoryInfo(h.dev, &mem); ret {
+	case C.NVML_SUCCESS:
+		// Continue below with the populated struct.
+	case C.NVML_ERROR_NOT_SUPPORTED:
+		return nil, DeviceMemory{}, nil
+	default:
+		return nil, DeviceMemory{}, errorString(ret)
+	}
+
+	total := uint64(mem.total) / mib
+	used := uint64(mem.used) / mib
+	free := uint64(mem.free) / mib
+
+	return &total, DeviceMemory{Used: &used, Free: &free}, nil
+}
+
+// deviceGetClockInfo returns the current SM and memory clocks, in that order.
+// All-nil results are returned when the SM query is not supported. The memory
+// clock is only queried after the SM query succeeds, and the status of the
+// last query made is the one reported.
+func (h handle) deviceGetClockInfo() (*uint, *uint, error) {
+	var sm, mem C.uint
+
+	ret := C.nvmlDeviceGetClockInfo(h.dev, C.NVML_CLOCK_SM, &sm)
+	switch ret {
+	case C.NVML_ERROR_NOT_SUPPORTED:
+		return nil, nil, nil
+	case C.NVML_SUCCESS:
+		ret = C.nvmlDeviceGetClockInfo(h.dev, C.NVML_CLOCK_MEM, &mem)
+	}
+	return uintPtr(sm), uintPtr(mem), errorString(ret)
+}
+
+// deviceGetMemoryErrorCounter returns the uncorrected volatile ECC error
+// counters for the L1 cache, the L2 cache and device memory, in that order.
+//
+// All-nil results are returned when the L1 query is not supported. Each later
+// location is only queried while the previous query succeeded, and the status
+// of the last query made is the one reported.
+func (h handle) deviceGetMemoryErrorCounter() (*uint64, *uint64, *uint64, error) {
+	var l1, l2, mem C.ulonglong
+
+	ret := C.nvmlDeviceGetMemoryErrorCounter(h.dev, C.NVML_MEMORY_ERROR_TYPE_UNCORRECTED,
+		C.NVML_VOLATILE_ECC, C.NVML_MEMORY_LOCATION_L1_CACHE, &l1)
+	switch ret {
+	case C.NVML_ERROR_NOT_SUPPORTED:
+		return nil, nil, nil, nil
+	case C.NVML_SUCCESS:
+		ret = C.nvmlDeviceGetMemoryErrorCounter(h.dev, C.NVML_MEMORY_ERROR_TYPE_UNCORRECTED,
+			C.NVML_VOLATILE_ECC, C.NVML_MEMORY_LOCATION_L2_CACHE, &l2)
+		if ret == C.NVML_SUCCESS {
+			ret = C.nvmlDeviceGetMemoryErrorCounter(h.dev, C.NVML_MEMORY_ERROR_TYPE_UNCORRECTED,
+				C.NVML_VOLATILE_ECC, C.NVML_MEMORY_LOCATION_DEVICE_MEMORY, &mem)
+		}
+	}
+	return uint64Ptr(l1), uint64Ptr(l2), uint64Ptr(mem), errorString(ret)
+}
+
+// deviceGetPcieThroughput returns the PCIe RX and TX throughput counters, in
+// that order. All-nil results are returned when the RX query is not
+// supported. TX is only queried after the RX query succeeds, and the status
+// of the last query made is the one reported.
+func (h handle) deviceGetPcieThroughput() (*uint, *uint, error) {
+	var rx, tx C.uint
+
+	ret := C.nvmlDeviceGetPcieThroughput(h.dev, C.NVML_PCIE_UTIL_RX_BYTES, &rx)
+	switch ret {
+	case C.NVML_ERROR_NOT_SUPPORTED:
+		return nil, nil, nil
+	case C.NVML_SUCCESS:
+		ret = C.nvmlDeviceGetPcieThroughput(h.dev, C.NVML_PCIE_UTIL_TX_BYTES, &tx)
+	}
+	return uintPtr(rx), uintPtr(tx), errorString(ret)
+}
+
+// deviceGetComputeRunningProcesses returns the PIDs of the compute processes
+// running on the device and, index-aligned with them, the GPU memory each one
+// uses as reported by NVML.
+//
+// At most szProcs processes are reported. The results are (nil, nil, nil)
+// when the query is not supported, and (nil, nil, err) on any other failure.
+func (h handle) deviceGetComputeRunningProcesses() ([]uint, []uint64, error) {
+	var procs [szProcs]C.nvmlProcessInfo_t
+	var count = C.uint(szProcs)
+
+	r := C.nvmlDeviceGetComputeRunningProcesses(h.dev, &count, &procs[0])
+	if r == C.NVML_ERROR_NOT_SUPPORTED {
+		return nil, nil, nil
+	}
+	// Bail out before touching procs: on failure NVML may have overwritten
+	// count with a value larger than the array (e.g. the required size), and
+	// indexing with it would run out of range.
+	if r != C.NVML_SUCCESS {
+		return nil, nil, errorString(r)
+	}
+
+	n := int(count)
+	if n > len(procs) {
+		n = len(procs) // defensive clamp; never read past the fixed array
+	}
+	pids := make([]uint, n)
+	mems := make([]uint64, n)
+	for i := 0; i < n; i++ {
+		pids[i] = uint(procs[i].pid)
+		mems[i] = uint64(procs[i].usedGpuMemory)
+	}
+	return pids, mems, nil
+}
+
+// deviceGetGraphicsRunningProcesses returns the PIDs of the graphics
+// processes running on the device and, index-aligned with them, the GPU
+// memory each one uses as reported by NVML.
+//
+// At most szProcs processes are reported. The results are (nil, nil, nil)
+// when the query is not supported, and (nil, nil, err) on any other failure.
+func (h handle) deviceGetGraphicsRunningProcesses() ([]uint, []uint64, error) {
+	var procs [szProcs]C.nvmlProcessInfo_t
+	var count = C.uint(szProcs)
+
+	r := C.nvmlDeviceGetGraphicsRunningProcesses(h.dev, &count, &procs[0])
+	if r == C.NVML_ERROR_NOT_SUPPORTED {
+		return nil, nil, nil
+	}
+	// Bail out before touching procs: on failure NVML may have overwritten
+	// count with a value larger than the array (e.g. the required size), and
+	// indexing with it would run out of range.
+	if r != C.NVML_SUCCESS {
+		return nil, nil, errorString(r)
+	}
+
+	n := int(count)
+	if n > len(procs) {
+		n = len(procs) // defensive clamp; never read past the fixed array
+	}
+	pids := make([]uint, n)
+	mems := make([]uint64, n)
+	for i := 0; i < n; i++ {
+		pids[i] = uint(procs[i].pid)
+		mems[i] = uint64(procs[i].usedGpuMemory)
+	}
+	return pids, mems, nil
+}
+
+// deviceGetAllRunningProcesses merges the compute and graphics process lists
+// of the device into one slice sorted by PID.
+//
+// A PID present in both lists is reported once with type ComputeAndGraphics,
+// keeping the memory figure from the compute list. Memory is integer-divided
+// by 1024*1024 (MiB). Process names come from processName. The first error
+// from any lookup aborts the call.
+func (h handle) deviceGetAllRunningProcesses() ([]ProcessInfo, error) {
+	computePIDs, computeMem, err := h.deviceGetComputeRunningProcesses()
+	if err != nil {
+		return nil, err
+	}
+
+	graphicsPIDs, graphicsMem, err := h.deviceGetGraphicsRunningProcesses()
+	if err != nil {
+		return nil, err
+	}
+
+	const mib = 1024 * 1024
+	byPID := make(map[uint]ProcessInfo)
+
+	for idx, pid := range computePIDs {
+		name, err := processName(pid)
+		if err != nil {
+			return nil, err
+		}
+		byPID[pid] = ProcessInfo{
+			PID:        pid,
+			Name:       name,
+			MemoryUsed: computeMem[idx] / mib,
+			Type:       Compute,
+		}
+	}
+
+	for idx, pid := range graphicsPIDs {
+		if known, seen := byPID[pid]; seen {
+			// Already recorded: only upgrade the type.
+			known.Type = ComputeAndGraphics
+			byPID[pid] = known
+			continue
+		}
+
+		name, err := processName(pid)
+		if err != nil {
+			return nil, err
+		}
+		byPID[pid] = ProcessInfo{
+			PID:        pid,
+			Name:       name,
+			MemoryUsed: graphicsMem[idx] / mib,
+			Type:       Graphics,
+		}
+	}
+
+	// Left as a nil slice when there are no processes, as before.
+	var merged []ProcessInfo
+	for _, info := range byPID {
+		merged = append(merged, info)
+	}
+	sort.Slice(merged, func(a, b int) bool {
+		return merged[a].PID < merged[b].PID
+	})
+
+	return merged, nil
+}
+
+// getClocksThrottleReasons maps the device's current clocks-throttle value
+// onto a ThrottleReason.
+//
+// It returns ThrottleReasonUnknown with a nil error when the query is not
+// supported, and ThrottleReasonUnknown with the converted status on any other
+// failure. A value that equals none of the individual NVML reason constants
+// (for example several reason bits set at once) is also reported as
+// ThrottleReasonUnknown.
+func (h handle) getClocksThrottleReasons() (reason ThrottleReason, err error) {
+	var clocksThrottleReasons C.ulonglong
+
+	r := C.nvmlDeviceGetCurrentClocksThrottleReasons(h.dev, &clocksThrottleReasons)
+
+	if r == C.NVML_ERROR_NOT_SUPPORTED {
+		return ThrottleReasonUnknown, nil
+	}
+
+	if r != C.NVML_SUCCESS {
+		return ThrottleReasonUnknown, errorString(r)
+	}
+
+	switch clocksThrottleReasons {
+	case C.nvmlClocksThrottleReasonGpuIdle:
+		reason = ThrottleReasonGpuIdle
+	case C.nvmlClocksThrottleReasonApplicationsClocksSetting:
+		reason = ThrottleReasonApplicationsClocksSetting
+	case C.nvmlClocksThrottleReasonSwPowerCap:
+		reason = ThrottleReasonSwPowerCap
+	case C.nvmlClocksThrottleReasonHwSlowdown:
+		reason = ThrottleReasonHwSlowdown
+	case C.nvmlClocksThrottleReasonSyncBoost:
+		reason = ThrottleReasonSyncBoost
+	case C.nvmlClocksThrottleReasonSwThermalSlowdown:
+		reason = ThrottleReasonSwThermalSlowdown
+	case C.nvmlClocksThrottleReasonHwThermalSlowdown:
+		reason = ThrottleReasonHwThermalSlowdown
+	case C.nvmlClocksThrottleReasonHwPowerBrakeSlowdown:
+		reason = ThrottleReasonHwPowerBrakeSlowdown
+	case C.nvmlClocksThrottleReasonDisplayClockSetting:
+		reason = ThrottleReasonDisplayClockSetting
+	case C.nvmlClocksThrottleReasonNone:
+		reason = ThrottleReasonNone
+	default:
+		// Without this branch an unmatched value would leave reason at its
+		// zero value, which is ThrottleReasonGpuIdle.
+		reason = ThrottleReasonUnknown
+	}
+	return reason, nil
+}
+
+// getPerformanceState returns the device's performance state. It yields
+// PerfStateUnknown with a nil error when the query is not supported, and
+// PerfStateUnknown with the converted status on any other failure.
+func (h handle) getPerformanceState() (PerfState, error) {
+	var pstate C.nvmlPstates_t
+
+	switch ret := C.nvmlDeviceGetPerformanceState(h.dev, &pstate); ret {
+	case C.NVML_SUCCESS:
+		return PerfState(pstate), nil
+	case C.NVML_ERROR_NOT_SUPPORTED:
+		return PerfStateUnknown, nil
+	default:
+		return PerfStateUnknown, errorString(ret)
+	}
+}
+
+// processName reads the name of the process with the given pid from
+// /proc/<pid>/comm, with one trailing newline removed. A missing file is not
+// an error: it yields an empty name, since the process may have exited after
+// it was listed.
+func processName(pid uint) (string, error) {
+	path := `/proc/` + strconv.FormatUint(uint64(pid), 10) + `/comm`
+
+	raw, err := ioutil.ReadFile(path)
+	switch {
+	case err == nil:
+		return strings.TrimSuffix(string(raw), "\n"), nil
+	case os.IsNotExist(err):
+		// TOCTOU: process terminated
+		return "", nil
+	default:
+		return "", err
+	}
+}
+
+// getAccountingInfo returns the device's accounting mode and accounting
+// buffer size. If either query is not supported the zero Accounting value is
+// returned with a nil error; any other failure yields the zero value plus the
+// converted status.
+func (h handle) getAccountingInfo() (accountingInfo Accounting, err error) {
+	var (
+		mode   C.nvmlEnableState_t
+		buffer C.uint
+	)
+
+	switch ret := C.nvmlDeviceGetAccountingMode(h.dev, &mode); ret {
+	case C.NVML_SUCCESS:
+		// Continue with the buffer size query.
+	case C.NVML_ERROR_NOT_SUPPORTED:
+		return Accounting{}, nil
+	default:
+		return Accounting{}, errorString(ret)
+	}
+
+	switch ret := C.nvmlDeviceGetAccountingBufferSize(h.dev, &buffer); ret {
+	case C.NVML_SUCCESS:
+		// Both values are available.
+	case C.NVML_ERROR_NOT_SUPPORTED:
+		return Accounting{}, nil
+	default:
+		return Accounting{}, errorString(ret)
+	}
+
+	return Accounting{Mode: ModeState(mode), BufferSize: uintPtr(buffer)}, nil
+}
+
+// getDisplayInfo returns the device's display mode and display-active state.
+//
+// Display.Mode is filled from nvmlDeviceGetDisplayMode and Display.Active
+// from nvmlDeviceGetDisplayActive. If either query is not supported the zero
+// Display value is returned with a nil error; any other failure yields the
+// zero value plus the converted status.
+func (h handle) getDisplayInfo() (display Display, err error) {
+	var mode, isActive C.nvmlEnableState_t
+
+	// Each query writes into the variable named after it; previously the two
+	// destinations were crossed, so Mode and Active were reported swapped.
+	r := C.nvmlDeviceGetDisplayActive(h.dev, &isActive)
+	if r == C.NVML_ERROR_NOT_SUPPORTED {
+		return
+	}
+
+	if r != C.NVML_SUCCESS {
+		return display, errorString(r)
+	}
+
+	r = C.nvmlDeviceGetDisplayMode(h.dev, &mode)
+	if r == C.NVML_ERROR_NOT_SUPPORTED {
+		return
+	}
+	if r != C.NVML_SUCCESS {
+		return display, errorString(r)
+	}
+	display = Display{
+		Mode:   ModeState(mode),
+		Active: ModeState(isActive),
+	}
+	return
+}
+
+// getPeristenceMode returns the device's persistence mode. When the query is
+// not supported it returns the zero ModeState with a nil error; otherwise it
+// returns the converted mode together with the converted status.
+func (h handle) getPeristenceMode() (state ModeState, err error) {
+	var mode C.nvmlEnableState_t
+
+	switch ret := C.nvmlDeviceGetPersistenceMode(h.dev, &mode); ret {
+	case C.NVML_ERROR_NOT_SUPPORTED:
+		return state, nil
+	default:
+		return ModeState(mode), errorString(ret)
+	}
+}
--- a/vendor/github.com/NVIDIA/gpu-monitoring-tools/bindings/go/nvml/nvml.go
+++ b/vendor/github.com/NVIDIA/gpu-monitoring-tools/bindings/go/nvml/nvml.go
@@ -0,0 +1,533 @@
+// Copyright (c) 2015-2018, NVIDIA CORPORATION. All rights reserved.
+
+package nvml
+
+// #include "nvml_dl.h"
+import "C"
+
+import (
+	"bytes"
+	"errors"
+	"fmt"
+	"io/ioutil"
+	"strconv"
+	"strings"
+)
+
+// Sentinel errors exposed by this package.
+var (
+	// ErrCPUAffinity is used as the message prefix when numaNode cannot parse
+	// the NUMA node of a device.
+	ErrCPUAffinity        = errors.New("failed to retrieve CPU affinity")
+	// ErrUnsupportedP2PLink signals an unsupported P2P link type.
+	ErrUnsupportedP2PLink = errors.New("unsupported P2P link type")
+	// ErrUnsupportedGPU signals an unsupported GPU device.
+	ErrUnsupportedGPU     = errors.New("unsupported GPU device")
+)
+
+// ModeState reports whether an NVML feature is switched on or off.
+type ModeState uint
+
+// The numeric values mirror NVML's nvmlEnableState_t (NVML_FEATURE_DISABLED
+// is 0, NVML_FEATURE_ENABLED is 1), because the bindings convert that C enum
+// with a plain ModeState(mode) cast. Declaring Enabled first would invert
+// every reported state.
+const (
+	Disabled ModeState = iota
+	Enabled
+)
+
+// String returns "Enabled" or "Disabled" for the two known states and "N/A"
+// for any other value.
+func (m ModeState) String() string {
+	if m == Enabled {
+		return "Enabled"
+	}
+	if m == Disabled {
+		return "Disabled"
+	}
+	return "N/A"
+}
+
+// Display holds the two display-related states produced by getDisplayInfo.
+type Display struct {
+	Mode   ModeState
+	Active ModeState
+}
+
+// Accounting holds the accounting mode and accounting buffer size produced by
+// getAccountingInfo.
+type Accounting struct {
+	Mode       ModeState
+	BufferSize *uint
+}
+
+// DeviceMode groups the display, persistence and accounting state of a
+// device.
+type DeviceMode struct {
+	DisplayInfo    Display
+	Persistence    ModeState
+	AccountingInfo Accounting
+}
+
+// ThrottleReason identifies why a device's clocks are being throttled.
+type ThrottleReason uint
+
+// Known throttle reasons. These are package-local sequential identifiers, not
+// NVML's own values; getClocksThrottleReasons translates between the two.
+// ThrottleReasonUnknown is what that function returns when the query is not
+// supported or fails.
+const (
+	ThrottleReasonGpuIdle ThrottleReason = iota
+	ThrottleReasonApplicationsClocksSetting
+	ThrottleReasonSwPowerCap
+	ThrottleReasonHwSlowdown
+	ThrottleReasonSyncBoost
+	ThrottleReasonSwThermalSlowdown
+	ThrottleReasonHwThermalSlowdown
+	ThrottleReasonHwPowerBrakeSlowdown
+	ThrottleReasonDisplayClockSetting
+	ThrottleReasonNone
+	ThrottleReasonUnknown
+)
+
+// String returns a human-readable label for the throttle reason, or "N/A" for
+// ThrottleReasonUnknown and any value outside the known set.
+func (r ThrottleReason) String() string {
+	labels := [...]string{
+		ThrottleReasonGpuIdle:                   "Gpu Idle",
+		ThrottleReasonApplicationsClocksSetting: "Applications Clocks Setting",
+		ThrottleReasonSwPowerCap:                "SW Power Cap",
+		ThrottleReasonHwSlowdown:                "HW Slowdown",
+		ThrottleReasonSyncBoost:                 "Sync Boost",
+		ThrottleReasonSwThermalSlowdown:         "SW Thermal Slowdown",
+		ThrottleReasonHwThermalSlowdown:         "HW Thermal Slowdown",
+		ThrottleReasonHwPowerBrakeSlowdown:      "HW Power Brake Slowdown",
+		ThrottleReasonDisplayClockSetting:       "Display Clock Setting",
+		ThrottleReasonNone:                      "No clocks throttling",
+	}
+	if uint(r) < uint(len(labels)) {
+		return labels[r]
+	}
+	return "N/A"
+}
+
+// PerfState is a device performance state, as returned by
+// getPerformanceState.
+type PerfState uint
+
+// Bounds of the performance-state range. String renders every value from
+// PerfStateMax through PerfStateMin as "P<n>"; PerfStateUnknown is what
+// getPerformanceState returns when the state cannot be obtained. The
+// constants are untyped.
+const (
+	PerfStateMax     = 0
+	PerfStateMin     = 15
+	PerfStateUnknown = 32
+)
+
+// String renders states between PerfStateMax and PerfStateMin inclusive as
+// "P<n>" and everything else as "Unknown".
+func (p PerfState) String() string {
+	inRange := p >= PerfStateMax && p <= PerfStateMin
+	if !inRange {
+		return "Unknown"
+	}
+	return fmt.Sprintf("P%d", p)
+}
+
+// ProcessType classifies how a process uses the GPU.
+type ProcessType uint
+
+// Process classifications assigned by deviceGetAllRunningProcesses:
+// ComputeAndGraphics marks a PID that appears in both the compute and the
+// graphics process list.
+const (
+	Compute ProcessType = iota
+	Graphics
+	ComputeAndGraphics
+)
+
+// String returns "C" for Compute, "G" for Graphics and "C+G" for every other
+// value.
+func (t ProcessType) String() string {
+	switch t {
+	case Compute:
+		return "C"
+	case Graphics:
+		return "G"
+	default:
+		return "C+G"
+	}
+}
+
+// P2PLinkType describes the kind of peer-to-peer link between two devices.
+type P2PLinkType uint
+
+// Known link types. P2PLinkUnknown is the zero value and is rendered as "N/A"
+// by String.
+const (
+	P2PLinkUnknown P2PLinkType = iota
+	P2PLinkCrossCPU
+	P2PLinkSameCPU
+	P2PLinkHostBridge
+	P2PLinkMultiSwitch
+	P2PLinkSingleSwitch
+	P2PLinkSameBoard
+)
+
+// P2PLink pairs a bus id with a link type.
+// NOTE(review): presumably BusID identifies the peer device — confirm where
+// Device.Topology is populated.
+type P2PLink struct {
+	BusID string
+	Link  P2PLinkType
+}
+
+// String returns a human-readable description of the link type, or "N/A" for
+// P2PLinkUnknown and any unrecognised value.
+func (t P2PLinkType) String() string {
+	switch t {
+	case P2PLinkSameBoard:
+		return "Same board"
+	case P2PLinkSingleSwitch:
+		return "Single PCI switch"
+	case P2PLinkMultiSwitch:
+		return "Multiple PCI switches"
+	case P2PLinkHostBridge:
+		return "Host PCI bridge"
+	case P2PLinkSameCPU:
+		return "Same CPU socket"
+	case P2PLinkCrossCPU:
+		return "Cross CPU socket"
+	default:
+		return "N/A"
+	}
+}
+
+type ClockInfo struct {
+	Cores  *uint
+	Memory *uint
+}
+
+type PCIInfo struct {
+	BusID     string
+	BAR1      *uint64
+	Bandwidth *uint
+}
+
+type Device struct {
+	handle
+
+	UUID        string
+	Path        string
+	Model       *string
+	Power       *uint
+	Memory      *uint64
+	CPUAffinity *uint
+	PCI         PCIInfo
+	Clocks      ClockInfo
+	Topology    []P2PLink
+}
+
+type UtilizationInfo struct {
+	GPU     *uint
+	Memory  *uint
+	Encoder *uint
+	Decoder *uint
+}
+
+type PCIThroughputInfo struct {
+	RX *uint
+	TX *uint
+}
+
+// PCIStatusInfo is the PCI part of a DeviceStatus.
+type PCIStatusInfo struct {
+	BAR1Used   *uint64           // second value from deviceGetBAR1MemoryInfo, divided by 1024*1024 (MiB) in Device.Status
+	Throughput PCIThroughputInfo // RX/TX throughput, MB/s
+}
+
+// ECCErrorsInfo holds the three counters returned, in this order, by
+// deviceGetMemoryErrorCounter.
+type ECCErrorsInfo struct {
+	L1Cache *uint64
+	L2Cache *uint64
+	Device  *uint64
+}
+
+// DeviceMemory reports used and free device memory. Device.Status stores the
+// second value returned by deviceGetMemoryInfo in this form; the unit is not
+// visible in this part of the file.
+type DeviceMemory struct {
+	Used *uint64
+	Free *uint64
+}
+
+// MemoryInfo is the memory part of a DeviceStatus.
+type MemoryInfo struct {
+	Global    DeviceMemory  // used/free memory from deviceGetMemoryInfo
+	ECCErrors ECCErrorsInfo // counters from deviceGetMemoryErrorCounter
+}
+
+// ProcessInfo describes one process as returned by
+// deviceGetAllRunningProcesses.
+type ProcessInfo struct {
+	PID        uint
+	Name       string
+	MemoryUsed uint64      // unit not visible in this part of the file
+	Type       ProcessType // compute, graphics or both
+}
+
+// DeviceStatus is the snapshot returned by Device.Status. Pointer fields may
+// be nil.
+type DeviceStatus struct {
+	Power       *uint           // from deviceGetPowerUsage, divided by 1000 (W)
+	Temperature *uint           // °C, from deviceGetTemperature
+	Utilization UtilizationInfo // percentages
+	Memory      MemoryInfo      // used/free memory and ECC counters
+	Clocks      ClockInfo       // from deviceGetClockInfo, MHz
+	PCI         PCIStatusInfo   // BAR1 usage and PCIe throughput
+	Processes   []ProcessInfo   // from deviceGetAllRunningProcesses
+	Throttle    ThrottleReason  // from getClocksThrottleReasons
+	Performance PerfState       // from getPerformanceState
+}
+
+// assert panics with err when err is non-nil and does nothing otherwise. The
+// constructors and status getters in this file pair it with a deferred
+// recover that turns the panic back into their returned error.
+func assert(err error) {
+	if err == nil {
+		return
+	}
+	panic(err)
+}
+
+// Init is the exported entry point for init_ and returns its error unchanged.
+func Init() error {
+	return init_()
+}
+
+// Shutdown is the exported entry point for shutdown and returns its error
+// unchanged.
+func Shutdown() error {
+	return shutdown()
+}
+
+// GetDeviceCount returns the device count and error reported by
+// deviceGetCount.
+func GetDeviceCount() (uint, error) {
+	return deviceGetCount()
+}
+
+// GetDriverVersion returns the driver version string and error reported by
+// systemGetDriverVersion.
+func GetDriverVersion() (string, error) {
+	return systemGetDriverVersion()
+}
+
+// numaNode returns the NUMA node of the PCI device identified by busid, read
+// from /sys/bus/pci/devices/<id>/numa_node.
+//
+// The sysfs entry name is busid with its first four characters dropped and
+// the remainder lower-cased. Node 0 is reported when the file cannot be read
+// and when the kernel reports a negative node. An error wrapping
+// ErrCPUAffinity is returned when busid is too short to strip or when the
+// file content is not a valid 8-bit integer.
+func numaNode(busid string) (uint, error) {
+	// Number of leading characters stripped from busid to get the sysfs name.
+	const prefixLen = 4
+	if len(busid) < prefixLen {
+		// Slicing below would panic; report the malformed id instead.
+		return 0, fmt.Errorf("%v: malformed PCI bus id %q", ErrCPUAffinity, busid)
+	}
+
+	// discard leading zeros of busid
+	b, err := ioutil.ReadFile(fmt.Sprintf("/sys/bus/pci/devices/%s/numa_node", strings.ToLower(busid[prefixLen:])))
+	if err != nil {
+		// XXX report node 0 if NUMA support isn't enabled
+		return 0, nil
+	}
+	node, err := strconv.ParseInt(string(bytes.TrimSpace(b)), 10, 8)
+	if err != nil {
+		return 0, fmt.Errorf("%v: %v", ErrCPUAffinity, err)
+	}
+	if node < 0 {
+		node = 0 // XXX report node 0 instead of NUMA_NO_NODE
+	}
+	return uint(node), nil
+}
+
+// pciBandwidth returns the link bandwidth in MB/s for the given PCIe
+// generation and lane count: the per-lane rate of the generation times
+// *width. It returns nil when either argument is nil. Generations other than
+// 1-4 have a per-lane rate of 0, so the result points to 0.
+func pciBandwidth(gen, width *uint) *uint {
+	if gen == nil || width == nil {
+		return nil
+	}
+
+	var perLane uint // MB/s per lane; stays 0 for unlisted generations
+	switch *gen {
+	case 1:
+		perLane = 250
+	case 2:
+		perLane = 500
+	case 3:
+		perLane = 985
+	case 4:
+		perLane = 1969
+	}
+
+	total := perLane * *width
+	return &total
+}
+
+// NewDevice builds a fully populated Device for the GPU at index idx.
+//
+// Each query error is raised through assert and converted back into the
+// returned error by the deferred recover, so the first failing call aborts
+// construction and device stays nil. ErrUnsupportedGPU is returned when the
+// minor number, bus id or UUID is unavailable. Power is scaled to W and BAR1
+// to MiB before returning.
+func NewDevice(idx uint) (device *Device, err error) {
+	defer func() {
+		if r := recover(); r != nil {
+			// A recovered value is not guaranteed to be an error; an
+			// unchecked assertion would panic again inside this handler.
+			if e, ok := r.(error); ok {
+				err = e
+			} else {
+				err = fmt.Errorf("%v", r)
+			}
+		}
+	}()
+
+	h, err := deviceGetHandleByIndex(idx)
+	assert(err)
+	model, err := h.deviceGetName()
+	assert(err)
+	uuid, err := h.deviceGetUUID()
+	assert(err)
+	minor, err := h.deviceGetMinorNumber()
+	assert(err)
+	power, err := h.deviceGetPowerManagementLimit()
+	assert(err)
+	totalMem, _, err := h.deviceGetMemoryInfo()
+	assert(err)
+	busid, err := h.deviceGetPciInfo()
+	assert(err)
+	bar1, _, err := h.deviceGetBAR1MemoryInfo()
+	assert(err)
+	pcig, err := h.deviceGetMaxPcieLinkGeneration()
+	assert(err)
+	pciw, err := h.deviceGetMaxPcieLinkWidth()
+	assert(err)
+	ccore, cmem, err := h.deviceGetMaxClockInfo()
+	assert(err)
+
+	// These three are dereferenced below, so they are mandatory.
+	if minor == nil || busid == nil || uuid == nil {
+		return nil, ErrUnsupportedGPU
+	}
+	path := fmt.Sprintf("/dev/nvidia%d", *minor)
+	node, err := numaNode(*busid)
+	assert(err)
+
+	device = &Device{
+		handle:      h,
+		UUID:        *uuid,
+		Path:        path,
+		Model:       model,
+		Power:       power,
+		Memory:      totalMem,
+		CPUAffinity: &node,
+		PCI: PCIInfo{
+			BusID:     *busid,
+			BAR1:      bar1,
+			Bandwidth: pciBandwidth(pcig, pciw), // MB/s
+		},
+		Clocks: ClockInfo{
+			Cores:  ccore, // MHz
+			Memory: cmem,  // MHz
+		},
+	}
+	// The struct shares the pointers obtained above, so the values are
+	// rescaled in place.
+	if power != nil {
+		*device.Power /= 1000 // W
+	}
+	if bar1 != nil {
+		*device.PCI.BAR1 /= 1024 * 1024 // MiB
+	}
+	return
+}
+
+// NewDeviceLite builds a Device for the GPU at index idx with only the
+// handle, UUID, Path and PCI.BusID set; every other field keeps its zero
+// value.
+//
+// Query errors are raised through assert and converted back into the returned
+// error by the deferred recover. ErrUnsupportedGPU is returned when the minor
+// number, bus id or UUID is unavailable.
+func NewDeviceLite(idx uint) (device *Device, err error) {
+	defer func() {
+		if r := recover(); r != nil {
+			// A recovered value is not guaranteed to be an error; an
+			// unchecked assertion would panic again inside this handler.
+			if e, ok := r.(error); ok {
+				err = e
+			} else {
+				err = fmt.Errorf("%v", r)
+			}
+		}
+	}()
+
+	h, err := deviceGetHandleByIndex(idx)
+	assert(err)
+	uuid, err := h.deviceGetUUID()
+	assert(err)
+	minor, err := h.deviceGetMinorNumber()
+	assert(err)
+	busid, err := h.deviceGetPciInfo()
+	assert(err)
+
+	// These three are dereferenced below, so they are mandatory.
+	if minor == nil || busid == nil || uuid == nil {
+		return nil, ErrUnsupportedGPU
+	}
+	path := fmt.Sprintf("/dev/nvidia%d", *minor)
+
+	device = &Device{
+		handle: h,
+		UUID:   *uuid,
+		Path:   path,
+		PCI: PCIInfo{
+			BusID: *busid,
+		},
+	}
+	return
+}
+
+// Status queries the device and returns a snapshot of its current state.
+//
+// Each query error is raised through assert and converted back into the
+// returned error by the deferred recover, so the first failing call aborts
+// the snapshot and status stays nil. Before returning, power is scaled to W,
+// BAR1 usage to MiB and PCIe throughput to MB/s.
+func (d *Device) Status() (status *DeviceStatus, err error) {
+	defer func() {
+		if r := recover(); r != nil {
+			// A recovered value is not guaranteed to be an error; an
+			// unchecked assertion would panic again inside this handler.
+			if e, ok := r.(error); ok {
+				err = e
+			} else {
+				err = fmt.Errorf("%v", r)
+			}
+		}
+	}()
+
+	power, err := d.deviceGetPowerUsage()
+	assert(err)
+	temp, err := d.deviceGetTemperature()
+	assert(err)
+	ugpu, umem, err := d.deviceGetUtilizationRates()
+	assert(err)
+	uenc, err := d.deviceGetEncoderUtilization()
+	assert(err)
+	udec, err := d.deviceGetDecoderUtilization()
+	assert(err)
+	_, devMem, err := d.deviceGetMemoryInfo()
+	assert(err)
+	ccore, cmem, err := d.deviceGetClockInfo()
+	assert(err)
+	_, bar1, err := d.deviceGetBAR1MemoryInfo()
+	assert(err)
+	el1, el2, emem, err := d.deviceGetMemoryErrorCounter()
+	assert(err)
+	pcirx, pcitx, err := d.deviceGetPcieThroughput()
+	assert(err)
+	throttle, err := d.getClocksThrottleReasons()
+	assert(err)
+	perfState, err := d.getPerformanceState()
+	assert(err)
+	processInfo, err := d.deviceGetAllRunningProcesses()
+	assert(err)
+
+	status = &DeviceStatus{
+		Power:       power,
+		Temperature: temp, // °C
+		Utilization: UtilizationInfo{
+			GPU:     ugpu, // %
+			Memory:  umem, // %
+			Encoder: uenc, // %
+			Decoder: udec, // %
+		},
+		Memory: MemoryInfo{
+			Global: devMem,
+			ECCErrors: ECCErrorsInfo{
+				L1Cache: el1,
+				L2Cache: el2,
+				Device:  emem,
+			},
+		},
+		Clocks: ClockInfo{
+			Cores:  ccore, // MHz
+			Memory: cmem,  // MHz
+		},
+		PCI: PCIStatusInfo{
+			BAR1Used: bar1,
+			Throughput: PCIThroughputInfo{
+				RX: pcirx,
+				TX: pcitx,
+			},
+		},
+		Throttle:    throttle,
+		Performance: perfState,
+		Processes:   processInfo,
+	}
+	// The struct shares the pointers obtained above, so the values are
+	// rescaled in place.
+	if power != nil {
+		*status.Power /= 1000 // W
+	}
+	if bar1 != nil {
+		*status.PCI.BAR1Used /= 1024 * 1024 // MiB
+	}
+	if pcirx != nil {
+		*status.PCI.Throughput.RX /= 1000 // MB/s
+	}
+	if pcitx != nil {
+		*status.PCI.Throughput.TX /= 1000 // MB/s
+	}
+	return
+}
+
+// GetP2PLink maps the topology level reported by
+// deviceGetTopologyCommonAncestor for dev1 and dev2 onto a P2PLinkType.
+//
+// A failed query returns P2PLinkUnknown with the query's error; a nil level
+// returns P2PLinkUnknown with a nil error. A level that matches none of the
+// known NVML_TOPOLOGY_* constants returns P2PLinkUnknown with
+// ErrUnsupportedP2PLink.
+func GetP2PLink(dev1, dev2 *Device) (link P2PLinkType, err error) {
+	ancestor, queryErr := deviceGetTopologyCommonAncestor(dev1.handle, dev2.handle)
+	if queryErr != nil {
+		return P2PLinkUnknown, queryErr
+	}
+	if ancestor == nil {
+		return P2PLinkUnknown, nil
+	}
+
+	switch *ancestor {
+	case C.NVML_TOPOLOGY_INTERNAL:
+		return P2PLinkSameBoard, nil
+	case C.NVML_TOPOLOGY_SINGLE:
+		return P2PLinkSingleSwitch, nil
+	case C.NVML_TOPOLOGY_MULTIPLE:
+		return P2PLinkMultiSwitch, nil
+	case C.NVML_TOPOLOGY_HOSTBRIDGE:
+		return P2PLinkHostBridge, nil
+	case C.NVML_TOPOLOGY_CPU:
+		return P2PLinkSameCPU, nil
+	case C.NVML_TOPOLOGY_SYSTEM:
+		return P2PLinkCrossCPU, nil
+	}
+	return P2PLinkUnknown, ErrUnsupportedP2PLink
+}
+
+// GetComputeRunningProcesses returns whatever the handle's
+// deviceGetComputeRunningProcesses returns.
+// NOTE(review): the two slices are presumably parallel (PIDs and per-process
+// memory use) — confirm against the handle implementation.
+func (d *Device) GetComputeRunningProcesses() ([]uint, []uint64, error) {
+	return d.handle.deviceGetComputeRunningProcesses()
+}
+
+// GetGraphicsRunningProcesses returns whatever the handle's
+// deviceGetGraphicsRunningProcesses returns.
+// NOTE(review): the two slices are presumably parallel (PIDs and per-process
+// memory use) — confirm against the handle implementation.
+func (d *Device) GetGraphicsRunningProcesses() ([]uint, []uint64, error) {
+	return d.handle.deviceGetGraphicsRunningProcesses()
+}
+
+// GetAllRunningProcesses returns the ProcessInfo list produced by the
+// handle's deviceGetAllRunningProcesses, together with its error.
+func (d *Device) GetAllRunningProcesses() ([]ProcessInfo, error) {
+	return d.handle.deviceGetAllRunningProcesses()
+}
+
+// GetDeviceMode collects the display, persistence and accounting settings of
+// the device into a DeviceMode.
+//
+// Query errors are raised through assert and converted back into the returned
+// error by the deferred recover; on failure mode stays nil.
+func (d *Device) GetDeviceMode() (mode *DeviceMode, err error) {
+	defer func() {
+		if r := recover(); r != nil {
+			// A recovered value is not guaranteed to be an error; an
+			// unchecked assertion would panic again inside this handler.
+			if e, ok := r.(error); ok {
+				err = e
+			} else {
+				err = fmt.Errorf("%v", r)
+			}
+		}
+	}()
+
+	display, err := d.getDisplayInfo()
+	assert(err)
+
+	p, err := d.getPeristenceMode()
+	assert(err)
+
+	accounting, err := d.getAccountingInfo()
+	assert(err)
+
+	mode = &DeviceMode{
+		DisplayInfo:    display,
+		Persistence:    p,
+		AccountingInfo: accounting,
+	}
+	return
+}
--- a/vendor/github.com/NVIDIA/gpu-monitoring-tools/bindings/go/nvml/nvml.h
+++ b/vendor/github.com/NVIDIA/gpu-monitoring-tools/bindings/go/nvml/nvml.h
--- a/vendor/github.com/NVIDIA/gpu-monitoring-tools/bindings/go/nvml/nvml_dl.c
+++ b/vendor/github.com/NVIDIA/gpu-monitoring-tools/bindings/go/nvml/nvml_dl.c
@@ -0,0 +1,46 @@
+// Copyright (c) 2015-2018, NVIDIA CORPORATION. All rights reserved.
+
+#include <stddef.h>
+#include <dlfcn.h>
+
+#include "nvml_dl.h"
+
+/*
+ * DLSYM(x, sym): look up the symbol named `sym` in the library referenced by
+ * the file-scope `handle` and store its address in `x`.
+ *
+ * dlerror() is called first to clear any stale error, then again after
+ * dlsym() to detect failure. On failure the macro returns
+ * NVML_ERROR_FUNCTION_NOT_FOUND from the ENCLOSING function, so it may only
+ * be used inside functions returning nvmlReturn_t.
+ */
+#define DLSYM(x, sym)                           \
+do {                                            \
+    dlerror();				        \
+    x = dlsym(handle, #sym);                    \
+    if (dlerror() != NULL) {                    \
+        return (NVML_ERROR_FUNCTION_NOT_FOUND); \
+    }                                           \
+} while (0)
+
+/* Function-pointer type used for symbols resolved through DLSYM; the empty
+ * parameter list leaves the arguments unchecked. */
+typedef nvmlReturn_t (*nvmlSym_t)();
+
+/* Library handle set by dlopen() in NVML_DL(nvmlInit) and released by
+ * dlclose() in NVML_DL(nvmlShutdown). */
+static void *handle;
+
+/*
+ * Load libnvidia-ml.so.1 into the file-scope `handle`, then call nvmlInit().
+ * Returns NVML_ERROR_LIBRARY_NOT_FOUND when dlopen() fails; otherwise returns
+ * the result of nvmlInit().
+ */
+nvmlReturn_t NVML_DL(nvmlInit)(void)
+{
+    handle = dlopen("libnvidia-ml.so.1", RTLD_LAZY | RTLD_GLOBAL);
+    return ((handle != NULL) ? nvmlInit() : NVML_ERROR_LIBRARY_NOT_FOUND);
+}
+
+/*
+ * Call nvmlShutdown() and, only if it succeeds, dlclose() the library handle.
+ * Returns the nvmlShutdown() status on failure, NVML_ERROR_UNKNOWN when
+ * dlclose() reports an error, and NVML_SUCCESS otherwise.
+ */
+nvmlReturn_t NVML_DL(nvmlShutdown)(void)
+{
+    nvmlReturn_t status = nvmlShutdown();
+
+    /* Short-circuit keeps the library loaded when NVML shutdown failed. */
+    if (status == NVML_SUCCESS && dlclose(handle) != 0) {
+        status = NVML_ERROR_UNKNOWN;
+    }
+    return (status);
+}
+
+/*
+ * Resolve nvmlDeviceGetTopologyCommonAncestor at run time and forward the
+ * call. Returns NVML_ERROR_FUNCTION_NOT_FOUND (via DLSYM) when the loaded
+ * library does not export the symbol; otherwise returns the callee's result.
+ */
+nvmlReturn_t NVML_DL(nvmlDeviceGetTopologyCommonAncestor)(
+  nvmlDevice_t dev1, nvmlDevice_t dev2, nvmlGpuTopologyLevel_t *info)
+{
+    nvmlSym_t fn;
+
+    DLSYM(fn, nvmlDeviceGetTopologyCommonAncestor);
+    return (fn(dev1, dev2, info));
+}
--- a/vendor/github.com/NVIDIA/gpu-monitoring-tools/bindings/go/nvml/nvml_dl.h
+++ b/vendor/github.com/NVIDIA/gpu-monitoring-tools/bindings/go/nvml/nvml_dl.h
@@ -0,0 +1,15 @@
+// Copyright (c) 2015-2018, NVIDIA CORPORATION. All rights reserved.
+
+/* Declarations for the dlopen-based NVML wrappers; each wrapper is named
+ * after the NVML entry point it wraps, with a "_dl" suffix. */
+#ifndef _NVML_DL_H_
+#define _NVML_DL_H_
+
+#include "nvml.h"
+
+/* NVML_DL(x) appends the "_dl" suffix, e.g. NVML_DL(nvmlInit) -> nvmlInit_dl. */
+#define NVML_DL(x) x##_dl
+
+/* Wrappers declared here; each returns an nvmlReturn_t status code. */
+extern nvmlReturn_t NVML_DL(nvmlInit)(void);
+extern nvmlReturn_t NVML_DL(nvmlShutdown)(void);
+extern nvmlReturn_t NVML_DL(nvmlDeviceGetTopologyCommonAncestor)(
+  nvmlDevice_t, nvmlDevice_t, nvmlGpuTopologyLevel_t *);
+
+#endif // _NVML_DL_H_
--- a/vendor/vendor.json
+++ b/vendor/vendor.json
@@ -9,6 +9,8 @@
 		{"path":"github.com/Azure/go-ansiterm/winterm","checksumSHA1":"jBimnggjIiFUjaImNoJhSVLtdzw=","revision":"fa152c58bc15761d0200cb75fe958b89a9d4888e","revisionTime":"2016-06-22T17:32:16Z"},
 		{"path":"github.com/DataDog/datadog-go/statsd","checksumSHA1":"WvApwvvSe3i/3KO8300dyeFmkbI=","revision":"b10af4b12965a1ad08d164f57d14195b4140d8de","revisionTime":"2017-08-09T10:47:06Z"},
 		{"path":"github.com/Microsoft/go-winio","checksumSHA1":"AzjRkOQtVBTwIw4RJLTygFhJs3s=","revision":"f533f7a102197536779ea3a8cb881d639e21ec5a","revisionTime":"2017-05-24T00:36:31Z"},
+		{"path":"github.com/NVIDIA/gpu-monitoring-tools","checksumSHA1":"kF1vk+8Xvb3nGBiw9+qbUc0SZ4M=","revision":"86f2a9fac6c5b597dc494420005144b8ef7ec9fb","revisionTime":"2018-08-29T22:20:09Z"},
+		{"path":"github.com/NVIDIA/gpu-monitoring-tools/bindings/go/nvml","checksumSHA1":"P8FATSSgpe5A17FyPrGpsX95Xw8=","revision":"86f2a9fac6c5b597dc494420005144b8ef7ec9fb","revisionTime":"2018-08-29T22:20:09Z"},
 		{"path":"github.com/NYTimes/gziphandler","checksumSHA1":"jktW57+vJsziNVPeXMCoujTzdW4=","revision":"97ae7fbaf81620fe97840685304a78a306a39c64","revisionTime":"2017-09-16T00:36:49Z"},
 		{"path":"github.com/Nvveen/Gotty","checksumSHA1":"Aqy8/FoAIidY/DeQ5oTYSZ4YFVc=","revision":"cd527374f1e5bff4938207604a14f2e38a9cf512","revisionTime":"2012-06-04T00:48:16Z"},
 		{"path":"github.com/RackSec/srslog","checksumSHA1":"OTN4c1F0p+mEG2CpkU1Kuavupf0=","revision":"259aed10dfa74ea2961eddd1d9847619f6e98837","revisionTime":"2016-01-20T22:33:50Z"},