nvidia package restructure + build non-linux

This commit is contained in:
Alex Dadgar
2018-10-05 13:50:18 -07:00
parent 3817ce3dd5
commit 7972221db9
14 changed files with 183 additions and 114 deletions

View File

@@ -3,8 +3,8 @@ package main
import (
log "github.com/hashicorp/go-hclog"
"github.com/hashicorp/nomad/devices/gpu/nvidia"
"github.com/hashicorp/nomad/plugins"
"github.com/hashicorp/nomad/plugins/device/cmd/nvidia"
)
func main() {

View File

@@ -9,9 +9,9 @@ import (
log "github.com/hashicorp/go-hclog"
"github.com/hashicorp/nomad/devices/gpu/nvidia/nvml"
"github.com/hashicorp/nomad/plugins/base"
"github.com/hashicorp/nomad/plugins/device"
"github.com/hashicorp/nomad/plugins/device/cmd/nvidia/nvml"
"github.com/hashicorp/nomad/plugins/shared/hclspec"
)
@@ -73,9 +73,9 @@ type NvidiaDevice struct {
// nvmlClient is used to get data from nvidia
nvmlClient nvml.NvmlClient
// nvmlClientInitializationError holds an error retrieved during
// initErr holds an error retrieved during
// nvmlClient initialization
nvmlClientInitializationError error
initErr error
// ignoredGPUIDs is a set of UUIDs that would not be exposed to nomad
ignoredGPUIDs map[string]struct{}
@@ -96,17 +96,17 @@ type NvidiaDevice struct {
// NewNvidiaDevice returns a new nvidia device plugin.
func NewNvidiaDevice(log log.Logger) *NvidiaDevice {
nvmlClient, nvmlClientInitializationError := nvml.NewNvmlClient()
nvmlClient, err := nvml.NewNvmlClient()
logger := log.Named(pluginName)
if nvmlClientInitializationError != nil {
logger.Error("unable to initialize Nvidia driver", "error", nvmlClientInitializationError)
if err != nil && err.Error() != nvml.UnavailableLib.Error() {
logger.Error("unable to initialize Nvidia driver", "reason", err)
}
return &NvidiaDevice{
logger: logger,
devices: make(map[string]struct{}),
ignoredGPUIDs: make(map[string]struct{}),
nvmlClient: nvmlClient,
nvmlClientInitializationError: nvmlClientInitializationError,
logger: logger,
devices: make(map[string]struct{}),
ignoredGPUIDs: make(map[string]struct{}),
nvmlClient: nvmlClient,
initErr: err,
}
}

View File

@@ -3,9 +3,8 @@ package nvidia
import (
"testing"
"github.com/hashicorp/nomad/plugins/device/cmd/nvidia/nvml"
hclog "github.com/hashicorp/go-hclog"
"github.com/hashicorp/nomad/devices/gpu/nvidia/nvml"
"github.com/hashicorp/nomad/plugins/device"
"github.com/stretchr/testify/require"
)

View File

@@ -5,8 +5,8 @@ import (
"fmt"
"time"
"github.com/hashicorp/nomad/devices/gpu/nvidia/nvml"
"github.com/hashicorp/nomad/plugins/device"
"github.com/hashicorp/nomad/plugins/device/cmd/nvidia/nvml"
)
const (
@@ -26,11 +26,16 @@ const (
func (d *NvidiaDevice) fingerprint(ctx context.Context, devices chan<- *device.FingerprintResponse) {
defer close(devices)
if d.nvmlClientInitializationError != nil {
d.logger.Error("exiting fingerprinting due to problems with NVML loading", "error", d.nvmlClientInitializationError)
// write empty fingerprint response to let server know that there are
// no working Nvidia GPU units
devices <- device.NewFingerprint()
if d.initErr != nil {
if d.initErr.Error() != nvml.UnavailableLib.Error() {
d.logger.Error("exiting fingerprinting due to problems with NVML loading", "error", d.initErr)
devices <- device.NewFingerprintError(d.initErr)
} else {
// write empty fingerprint response to let server know that there are
// no working Nvidia GPU units
devices <- device.NewFingerprint()
}
return
}
@@ -51,7 +56,6 @@ func (d *NvidiaDevice) fingerprint(ctx context.Context, devices chan<- *device.F
// writeFingerprintToChannel makes nvml call and writes response to channel
func (d *NvidiaDevice) writeFingerprintToChannel(devices chan<- *device.FingerprintResponse) {
fingerprintData, err := d.nvmlClient.GetFingerprintData()
if err != nil {
d.logger.Error("failed to get fingerprint nvidia devices", "error", err)
devices <- device.NewFingerprintError(err)

View File

@@ -7,9 +7,9 @@ import (
"testing"
hclog "github.com/hashicorp/go-hclog"
"github.com/hashicorp/nomad/devices/gpu/nvidia/nvml"
"github.com/hashicorp/nomad/helper"
"github.com/hashicorp/nomad/plugins/device"
"github.com/hashicorp/nomad/plugins/device/cmd/nvidia/nvml"
"github.com/stretchr/testify/require"
)
@@ -197,8 +197,10 @@ func TestIgnoreFingerprintedDevices(t *testing.T) {
ExpectedResult: nil,
},
} {
actualResult := ignoreFingerprintedDevices(testCase.DeviceData, testCase.IgnoredGPUIds)
require.New(t).Equal(testCase.ExpectedResult, actualResult)
t.Run(testCase.Name, func(t *testing.T) {
actualResult := ignoreFingerprintedDevices(testCase.DeviceData, testCase.IgnoredGPUIds)
require.New(t).Equal(testCase.ExpectedResult, actualResult)
})
}
}
@@ -342,12 +344,14 @@ func TestCheckFingerprintUpdates(t *testing.T) {
DeviceMapAfterMethodCall: map[string]struct{}{},
},
} {
actualResult := testCase.Device.fingerprintChanged(testCase.AllDevices)
req := require.New(t)
// check that function returns valid "updated / not updated" state
req.Equal(testCase.ExpectedResult, actualResult)
// check that function propely updates devices map
req.Equal(testCase.Device.devices, testCase.DeviceMapAfterMethodCall)
t.Run(testCase.Name, func(t *testing.T) {
actualResult := testCase.Device.fingerprintChanged(testCase.AllDevices)
req := require.New(t)
// check that function returns valid "updated / not updated" state
req.Equal(testCase.ExpectedResult, actualResult)
// check that function propely updates devices map
req.Equal(testCase.Device.devices, testCase.DeviceMapAfterMethodCall)
})
}
}
@@ -554,8 +558,10 @@ func TestAttributesFromFingerprintDeviceData(t *testing.T) {
},
},
} {
actualResult := attributesFromFingerprintDeviceData(testCase.FingerprintDeviceData)
require.New(t).Equal(testCase.ExpectedResult, actualResult)
t.Run(testCase.Name, func(t *testing.T) {
actualResult := attributesFromFingerprintDeviceData(testCase.FingerprintDeviceData)
require.New(t).Equal(testCase.ExpectedResult, actualResult)
})
}
}
@@ -715,8 +721,10 @@ func TestDeviceGroupFromFingerprintData(t *testing.T) {
ExpectedResult: nil,
},
} {
actualResult := deviceGroupFromFingerprintData(testCase.GroupName, testCase.Devices, testCase.CommonAttributes)
require.New(t).Equal(testCase.ExpectedResult, actualResult)
t.Run(testCase.Name, func(t *testing.T) {
actualResult := deviceGroupFromFingerprintData(testCase.GroupName, testCase.Devices, testCase.CommonAttributes)
require.New(t).Equal(testCase.ExpectedResult, actualResult)
})
}
}
@@ -1070,20 +1078,22 @@ func TestWriteFingerprintToChannel(t *testing.T) {
},
},
} {
channel := make(chan *device.FingerprintResponse, 1)
testCase.Device.writeFingerprintToChannel(channel)
actualResult := <-channel
// writeFingerprintToChannel iterates over map keys
// and insterts results to an array, so order of elements in output array
// may be different
// actualResult, expectedResult arrays has to be sorted firsted
sort.Slice(actualResult.Devices, func(i, j int) bool {
return actualResult.Devices[i].Name < actualResult.Devices[j].Name
t.Run(testCase.Name, func(t *testing.T) {
channel := make(chan *device.FingerprintResponse, 1)
testCase.Device.writeFingerprintToChannel(channel)
actualResult := <-channel
// writeFingerprintToChannel iterates over map keys
// and insterts results to an array, so order of elements in output array
// may be different
// actualResult, expectedResult arrays has to be sorted firsted
sort.Slice(actualResult.Devices, func(i, j int) bool {
return actualResult.Devices[i].Name < actualResult.Devices[j].Name
})
sort.Slice(testCase.ExpectedWriteToChannel.Devices, func(i, j int) bool {
return testCase.ExpectedWriteToChannel.Devices[i].Name < testCase.ExpectedWriteToChannel.Devices[j].Name
})
require.New(t).Equal(testCase.ExpectedWriteToChannel, actualResult)
})
sort.Slice(testCase.ExpectedWriteToChannel.Devices, func(i, j int) bool {
return testCase.ExpectedWriteToChannel.Devices[i].Name < testCase.ExpectedWriteToChannel.Devices[j].Name
})
require.New(t).Equal(testCase.ExpectedWriteToChannel, actualResult)
}
}
@@ -1097,7 +1107,7 @@ func TestFingerprint(t *testing.T) {
{
Name: "Check that working driver returns valid fingeprint data",
Device: &NvidiaDevice{
nvmlClientInitializationError: nil,
initErr: nil,
nvmlClient: &MockNvmlClient{
FingerprintResponseReturned: &nvml.FingerprintData{
DriverVersion: "1",
@@ -1197,9 +1207,9 @@ func TestFingerprint(t *testing.T) {
},
},
{
Name: "Check that not working driver returns empty fingeprint data",
Name: "Check that not working driver returns error fingeprint data",
Device: &NvidiaDevice{
nvmlClientInitializationError: errors.New(""),
initErr: errors.New("foo"),
nvmlClient: &MockNvmlClient{
FingerprintResponseReturned: &nvml.FingerprintData{
DriverVersion: "1",
@@ -1230,14 +1240,18 @@ func TestFingerprint(t *testing.T) {
},
logger: hclog.NewNullLogger(),
},
ExpectedWriteToChannel: &device.FingerprintResponse{},
ExpectedWriteToChannel: &device.FingerprintResponse{
Error: errors.New("foo"),
},
},
} {
outCh := make(chan *device.FingerprintResponse)
ctx, cancel := context.WithCancel(context.Background())
go testCase.Device.fingerprint(ctx, outCh)
result := <-outCh
cancel()
require.New(t).Equal(result, testCase.ExpectedWriteToChannel)
t.Run(testCase.Name, func(t *testing.T) {
outCh := make(chan *device.FingerprintResponse)
ctx, cancel := context.WithCancel(context.Background())
go testCase.Device.fingerprint(ctx, outCh)
result := <-outCh
cancel()
require.New(t).Equal(result, testCase.ExpectedWriteToChannel)
})
}
}

View File

@@ -0,0 +1,33 @@
// +build !linux
package nvml
// Initialize is the stub used under this file's !linux build constraint.
// It does not locate or load any nvml shared object; it always returns
// UnavailableLib.
func (n *nvmlDriver) Initialize() error {
return UnavailableLib
}
// Shutdown is the stub used under this file's !linux build constraint.
// It performs no teardown and always returns UnavailableLib.
func (n *nvmlDriver) Shutdown() error {
return UnavailableLib
}
// SystemDriverVersion is the stub used under this file's !linux build
// constraint. It always returns an empty version string together with
// UnavailableLib.
func (n *nvmlDriver) SystemDriverVersion() (string, error) {
return "", UnavailableLib
}
// DeviceCount is the stub used under this file's !linux build constraint.
// It always reports zero devices together with UnavailableLib.
func (n *nvmlDriver) DeviceCount() (uint, error) {
return 0, UnavailableLib
}
// DeviceInfoByIndex is the stub used under this file's !linux build
// constraint. The index argument is ignored; the result is always a nil
// DeviceInfo together with UnavailableLib.
func (n *nvmlDriver) DeviceInfoByIndex(index uint) (*DeviceInfo, error) {
return nil, UnavailableLib
}
// DeviceInfoAndStatusByIndex is the stub used under this file's !linux build
// constraint. The index argument is ignored; the result is always a nil
// DeviceInfo and a nil DeviceStatus together with UnavailableLib.
func (n *nvmlDriver) DeviceInfoAndStatusByIndex(index uint) (*DeviceInfo, *DeviceStatus, error) {
return nil, nil, UnavailableLib
}

View File

@@ -4,59 +4,6 @@ import (
"github.com/NVIDIA/gpu-monitoring-tools/bindings/go/nvml"
)
// DeviceInfo represents nvml device data
// this struct is returned by NvmlDriver DeviceInfoByIndex and
// DeviceInfoAndStatusByIndex methods
type DeviceInfo struct {
// The following fields are guaranteed to be retrieved from nvml
UUID string
PCIBusID string
DisplayState string
PersistenceMode string
// The following fields can be nil after call to nvml, because nvml was
// not able to retrieve this fields for specific nvidia card
Name *string
MemoryMiB *uint64
PowerW *uint
BAR1MiB *uint64
PCIBandwidthMBPerS *uint
CoresClockMHz *uint
MemoryClockMHz *uint
}
// DeviceStatus represents nvml device status
// this struct is returned by NvmlDriver DeviceInfoAndStatusByIndex method
type DeviceStatus struct {
// The following fields can be nil after call to nvml, because nvml was
// not able to retrieve this fields for specific nvidia card
PowerUsageW *uint
TemperatureC *uint
GPUUtilization *uint // %
MemoryUtilization *uint // %
EncoderUtilization *uint // %
DecoderUtilization *uint // %
BAR1UsedMiB *uint64
UsedMemoryMiB *uint64
ECCErrorsL1Cache *uint64
ECCErrorsL2Cache *uint64
ECCErrorsDevice *uint64
}
// NvmlDriver represents set of methods to query nvml library
type NvmlDriver interface {
Initialize() error
Shutdown() error
SystemDriverVersion() (string, error)
DeviceCount() (uint, error)
DeviceInfoByIndex(uint) (*DeviceInfo, error)
DeviceInfoAndStatusByIndex(uint) (*DeviceInfo, *DeviceStatus, error)
}
// nvmlDriver implements NvmlDriver
// Users are required to call Initialize method before using any other methods
type nvmlDriver struct{}
// Initialize nvml library by locating nvml shared object file and calling ldopen
func (n *nvmlDriver) Initialize() error {
return nvml.Init()

View File

@@ -0,0 +1,61 @@
package nvml
import "errors"
var (
// UnavailableLib is the sentinel error returned when the nvml library could
// not be loaded.
UnavailableLib = errors.New("could not load NVML library")
)
// nvmlDriver implements NvmlDriver. It carries no state of its own.
// Users are required to call the Initialize method before using any other
// method.
type nvmlDriver struct{}
// NvmlDriver represents the set of methods used to query the nvml library.
type NvmlDriver interface {
// Initialize prepares the nvml library for use.
Initialize() error
// Shutdown stops any further interaction with nvml.
Shutdown() error
// SystemDriverVersion returns the installed driver version.
SystemDriverVersion() (string, error)
// DeviceCount reports the number of available GPU devices.
DeviceCount() (uint, error)
// DeviceInfoByIndex returns the DeviceInfo for the GPU at the given index
// in the system device list.
DeviceInfoByIndex(uint) (*DeviceInfo, error)
// DeviceInfoAndStatusByIndex returns the DeviceInfo and DeviceStatus for
// the GPU at the given index in the system device list.
DeviceInfoAndStatusByIndex(uint) (*DeviceInfo, *DeviceStatus, error)
}
// DeviceInfo represents nvml device data.
// This struct is returned by the NvmlDriver DeviceInfoByIndex and
// DeviceInfoAndStatusByIndex methods.
type DeviceInfo struct {
// The following fields are guaranteed to be retrieved from nvml.
UUID string
PCIBusID string
DisplayState string
PersistenceMode string
// The following fields can be nil after a call to nvml, because nvml was
// not able to retrieve these fields for the specific nvidia card.
Name *string
MemoryMiB *uint64
PowerW *uint
BAR1MiB *uint64
PCIBandwidthMBPerS *uint
CoresClockMHz *uint
MemoryClockMHz *uint
}
// DeviceStatus represents nvml device status.
// This struct is returned by the NvmlDriver DeviceInfoAndStatusByIndex method.
type DeviceStatus struct {
// The following fields can be nil after a call to nvml, because nvml was
// not able to retrieve these fields for the specific nvidia card.
PowerUsageW *uint
TemperatureC *uint
GPUUtilization *uint // %
MemoryUtilization *uint // %
EncoderUtilization *uint // %
DecoderUtilization *uint // %
BAR1UsedMiB *uint64
UsedMemoryMiB *uint64
ECCErrorsL1Cache *uint64
ECCErrorsL2Cache *uint64
ECCErrorsDevice *uint64
}

View File

@@ -4,8 +4,8 @@ import (
"context"
"time"
"github.com/hashicorp/nomad/devices/gpu/nvidia/nvml"
"github.com/hashicorp/nomad/plugins/device"
"github.com/hashicorp/nomad/plugins/device/cmd/nvidia/nvml"
)
const (
@@ -53,8 +53,12 @@ const (
func (d *NvidiaDevice) stats(ctx context.Context, stats chan<- *device.StatsResponse) {
defer close(stats)
if d.nvmlClientInitializationError != nil {
d.logger.Error("exiting stats due to problems with NVML loading", "error", d.nvmlClientInitializationError)
if d.initErr != nil {
if d.initErr.Error() != nvml.UnavailableLib.Error() {
d.logger.Error("exiting stats due to problems with NVML loading", "error", d.initErr)
stats <- device.NewStatsError(d.initErr)
}
return
}

View File

@@ -7,9 +7,9 @@ import (
"time"
hclog "github.com/hashicorp/go-hclog"
"github.com/hashicorp/nomad/devices/gpu/nvidia/nvml"
"github.com/hashicorp/nomad/helper"
"github.com/hashicorp/nomad/plugins/device"
"github.com/hashicorp/nomad/plugins/device/cmd/nvidia/nvml"
"github.com/stretchr/testify/require"
)

View File

@@ -141,6 +141,13 @@ type StatsResponse struct {
Error error
}
// NewStatsError builds a StatsResponse that carries only the given error;
// every other field is left at its zero value.
func NewStatsError(err error) *StatsResponse {
	resp := new(StatsResponse)
	resp.Error = err
	return resp
}
// DeviceGroupStats contains statistics for each device of a particular
// device group, identified by the vendor, type and name of the device.
type DeviceGroupStats struct {