diff --git a/plugins/device/cmd/nvidia/README.md b/devices/gpu/nvidia/README.md similarity index 100% rename from plugins/device/cmd/nvidia/README.md rename to devices/gpu/nvidia/README.md diff --git a/plugins/device/cmd/nvidia/cmd/main.go b/devices/gpu/nvidia/cmd/main.go similarity index 84% rename from plugins/device/cmd/nvidia/cmd/main.go rename to devices/gpu/nvidia/cmd/main.go index 1b5b0c41c..1f48a3450 100644 --- a/plugins/device/cmd/nvidia/cmd/main.go +++ b/devices/gpu/nvidia/cmd/main.go @@ -3,8 +3,8 @@ package main import ( log "github.com/hashicorp/go-hclog" + "github.com/hashicorp/nomad/devices/gpu/nvidia" "github.com/hashicorp/nomad/plugins" - "github.com/hashicorp/nomad/plugins/device/cmd/nvidia" ) func main() { diff --git a/plugins/device/cmd/nvidia/device.go b/devices/gpu/nvidia/device.go similarity index 90% rename from plugins/device/cmd/nvidia/device.go rename to devices/gpu/nvidia/device.go index 2613d8e77..8d3f0aeca 100644 --- a/plugins/device/cmd/nvidia/device.go +++ b/devices/gpu/nvidia/device.go @@ -9,9 +9,9 @@ import ( log "github.com/hashicorp/go-hclog" + "github.com/hashicorp/nomad/devices/gpu/nvidia/nvml" "github.com/hashicorp/nomad/plugins/base" "github.com/hashicorp/nomad/plugins/device" - "github.com/hashicorp/nomad/plugins/device/cmd/nvidia/nvml" "github.com/hashicorp/nomad/plugins/shared/hclspec" ) @@ -73,9 +73,9 @@ type NvidiaDevice struct { // nvmlClient is used to get data from nvidia nvmlClient nvml.NvmlClient - // nvmlClientInitializationError holds an error retrieved during + // initErr holds an error retrieved during // nvmlClient initialization - nvmlClientInitializationError error + initErr error // ignoredGPUIDs is a set of UUIDs that would not be exposed to nomad ignoredGPUIDs map[string]struct{} @@ -96,17 +96,17 @@ type NvidiaDevice struct { // NewNvidiaDevice returns a new nvidia device plugin. 
func NewNvidiaDevice(log log.Logger) *NvidiaDevice { - nvmlClient, nvmlClientInitializationError := nvml.NewNvmlClient() + nvmlClient, err := nvml.NewNvmlClient() logger := log.Named(pluginName) - if nvmlClientInitializationError != nil { - logger.Error("unable to initialize Nvidia driver", "error", nvmlClientInitializationError) + if err != nil && err.Error() != nvml.UnavailableLib.Error() { + logger.Error("unable to initialize Nvidia driver", "reason", err) } return &NvidiaDevice{ - logger: logger, - devices: make(map[string]struct{}), - ignoredGPUIDs: make(map[string]struct{}), - nvmlClient: nvmlClient, - nvmlClientInitializationError: nvmlClientInitializationError, + logger: logger, + devices: make(map[string]struct{}), + ignoredGPUIDs: make(map[string]struct{}), + nvmlClient: nvmlClient, + initErr: err, } } diff --git a/plugins/device/cmd/nvidia/device_test.go b/devices/gpu/nvidia/device_test.go similarity index 97% rename from plugins/device/cmd/nvidia/device_test.go rename to devices/gpu/nvidia/device_test.go index b1fa4b17a..e6daec967 100644 --- a/plugins/device/cmd/nvidia/device_test.go +++ b/devices/gpu/nvidia/device_test.go @@ -3,9 +3,8 @@ package nvidia import ( "testing" - "github.com/hashicorp/nomad/plugins/device/cmd/nvidia/nvml" - hclog "github.com/hashicorp/go-hclog" + "github.com/hashicorp/nomad/devices/gpu/nvidia/nvml" "github.com/hashicorp/nomad/plugins/device" "github.com/stretchr/testify/require" ) diff --git a/plugins/device/cmd/nvidia/fingerprint.go b/devices/gpu/nvidia/fingerprint.go similarity index 94% rename from plugins/device/cmd/nvidia/fingerprint.go rename to devices/gpu/nvidia/fingerprint.go index 171b01844..15e2e9c4a 100644 --- a/plugins/device/cmd/nvidia/fingerprint.go +++ b/devices/gpu/nvidia/fingerprint.go @@ -5,8 +5,8 @@ import ( "fmt" "time" + "github.com/hashicorp/nomad/devices/gpu/nvidia/nvml" "github.com/hashicorp/nomad/plugins/device" - "github.com/hashicorp/nomad/plugins/device/cmd/nvidia/nvml" ) const ( @@ -26,11 +26,16 
@@ const ( func (d *NvidiaDevice) fingerprint(ctx context.Context, devices chan<- *device.FingerprintResponse) { defer close(devices) - if d.nvmlClientInitializationError != nil { - d.logger.Error("exiting fingerprinting due to problems with NVML loading", "error", d.nvmlClientInitializationError) - // write empty fingerprint response to let server know that there are - // no working Nvidia GPU units - devices <- device.NewFingerprint() + if d.initErr != nil { + if d.initErr.Error() != nvml.UnavailableLib.Error() { + d.logger.Error("exiting fingerprinting due to problems with NVML loading", "error", d.initErr) + devices <- device.NewFingerprintError(d.initErr) + } else { + // write empty fingerprint response to let server know that there are + // no working Nvidia GPU units + devices <- device.NewFingerprint() + } + return } @@ -51,7 +56,6 @@ func (d *NvidiaDevice) fingerprint(ctx context.Context, devices chan<- *device.F // writeFingerprintToChannel makes nvml call and writes response to channel func (d *NvidiaDevice) writeFingerprintToChannel(devices chan<- *device.FingerprintResponse) { fingerprintData, err := d.nvmlClient.GetFingerprintData() - if err != nil { d.logger.Error("failed to get fingerprint nvidia devices", "error", err) devices <- device.NewFingerprintError(err) diff --git a/plugins/device/cmd/nvidia/fingerprint_test.go b/devices/gpu/nvidia/fingerprint_test.go similarity index 93% rename from plugins/device/cmd/nvidia/fingerprint_test.go rename to devices/gpu/nvidia/fingerprint_test.go index b181ebd65..b957d871f 100644 --- a/plugins/device/cmd/nvidia/fingerprint_test.go +++ b/devices/gpu/nvidia/fingerprint_test.go @@ -7,9 +7,9 @@ import ( "testing" hclog "github.com/hashicorp/go-hclog" + "github.com/hashicorp/nomad/devices/gpu/nvidia/nvml" "github.com/hashicorp/nomad/helper" "github.com/hashicorp/nomad/plugins/device" - "github.com/hashicorp/nomad/plugins/device/cmd/nvidia/nvml" "github.com/stretchr/testify/require" ) @@ -197,8 +197,10 @@ func 
TestIgnoreFingerprintedDevices(t *testing.T) { ExpectedResult: nil, }, } { - actualResult := ignoreFingerprintedDevices(testCase.DeviceData, testCase.IgnoredGPUIds) - require.New(t).Equal(testCase.ExpectedResult, actualResult) + t.Run(testCase.Name, func(t *testing.T) { + actualResult := ignoreFingerprintedDevices(testCase.DeviceData, testCase.IgnoredGPUIds) + require.New(t).Equal(testCase.ExpectedResult, actualResult) + }) } } @@ -342,12 +344,14 @@ func TestCheckFingerprintUpdates(t *testing.T) { DeviceMapAfterMethodCall: map[string]struct{}{}, }, } { - actualResult := testCase.Device.fingerprintChanged(testCase.AllDevices) - req := require.New(t) - // check that function returns valid "updated / not updated" state - req.Equal(testCase.ExpectedResult, actualResult) - // check that function propely updates devices map - req.Equal(testCase.Device.devices, testCase.DeviceMapAfterMethodCall) + t.Run(testCase.Name, func(t *testing.T) { + actualResult := testCase.Device.fingerprintChanged(testCase.AllDevices) + req := require.New(t) + // check that function returns valid "updated / not updated" state + req.Equal(testCase.ExpectedResult, actualResult) + // check that function propely updates devices map + req.Equal(testCase.Device.devices, testCase.DeviceMapAfterMethodCall) + }) } } @@ -554,8 +558,10 @@ func TestAttributesFromFingerprintDeviceData(t *testing.T) { }, }, } { - actualResult := attributesFromFingerprintDeviceData(testCase.FingerprintDeviceData) - require.New(t).Equal(testCase.ExpectedResult, actualResult) + t.Run(testCase.Name, func(t *testing.T) { + actualResult := attributesFromFingerprintDeviceData(testCase.FingerprintDeviceData) + require.New(t).Equal(testCase.ExpectedResult, actualResult) + }) } } @@ -715,8 +721,10 @@ func TestDeviceGroupFromFingerprintData(t *testing.T) { ExpectedResult: nil, }, } { - actualResult := deviceGroupFromFingerprintData(testCase.GroupName, testCase.Devices, testCase.CommonAttributes) - 
require.New(t).Equal(testCase.ExpectedResult, actualResult) + t.Run(testCase.Name, func(t *testing.T) { + actualResult := deviceGroupFromFingerprintData(testCase.GroupName, testCase.Devices, testCase.CommonAttributes) + require.New(t).Equal(testCase.ExpectedResult, actualResult) + }) } } @@ -1070,20 +1078,22 @@ func TestWriteFingerprintToChannel(t *testing.T) { }, }, } { - channel := make(chan *device.FingerprintResponse, 1) - testCase.Device.writeFingerprintToChannel(channel) - actualResult := <-channel - // writeFingerprintToChannel iterates over map keys - // and insterts results to an array, so order of elements in output array - // may be different - // actualResult, expectedResult arrays has to be sorted firsted - sort.Slice(actualResult.Devices, func(i, j int) bool { - return actualResult.Devices[i].Name < actualResult.Devices[j].Name + t.Run(testCase.Name, func(t *testing.T) { + channel := make(chan *device.FingerprintResponse, 1) + testCase.Device.writeFingerprintToChannel(channel) + actualResult := <-channel + // writeFingerprintToChannel iterates over map keys + // and insterts results to an array, so order of elements in output array + // may be different + // actualResult, expectedResult arrays has to be sorted firsted + sort.Slice(actualResult.Devices, func(i, j int) bool { + return actualResult.Devices[i].Name < actualResult.Devices[j].Name + }) + sort.Slice(testCase.ExpectedWriteToChannel.Devices, func(i, j int) bool { + return testCase.ExpectedWriteToChannel.Devices[i].Name < testCase.ExpectedWriteToChannel.Devices[j].Name + }) + require.New(t).Equal(testCase.ExpectedWriteToChannel, actualResult) }) - sort.Slice(testCase.ExpectedWriteToChannel.Devices, func(i, j int) bool { - return testCase.ExpectedWriteToChannel.Devices[i].Name < testCase.ExpectedWriteToChannel.Devices[j].Name - }) - require.New(t).Equal(testCase.ExpectedWriteToChannel, actualResult) } } @@ -1097,7 +1107,7 @@ func TestFingerprint(t *testing.T) { { Name: "Check that working driver 
returns valid fingeprint data", Device: &NvidiaDevice{ - nvmlClientInitializationError: nil, + initErr: nil, nvmlClient: &MockNvmlClient{ FingerprintResponseReturned: &nvml.FingerprintData{ DriverVersion: "1", @@ -1197,9 +1207,9 @@ func TestFingerprint(t *testing.T) { }, }, { - Name: "Check that not working driver returns empty fingeprint data", + Name: "Check that not working driver returns error fingeprint data", Device: &NvidiaDevice{ - nvmlClientInitializationError: errors.New(""), + initErr: errors.New("foo"), nvmlClient: &MockNvmlClient{ FingerprintResponseReturned: &nvml.FingerprintData{ DriverVersion: "1", @@ -1230,14 +1240,18 @@ func TestFingerprint(t *testing.T) { }, logger: hclog.NewNullLogger(), }, - ExpectedWriteToChannel: &device.FingerprintResponse{}, + ExpectedWriteToChannel: &device.FingerprintResponse{ + Error: errors.New("foo"), + }, }, } { - outCh := make(chan *device.FingerprintResponse) - ctx, cancel := context.WithCancel(context.Background()) - go testCase.Device.fingerprint(ctx, outCh) - result := <-outCh - cancel() - require.New(t).Equal(result, testCase.ExpectedWriteToChannel) + t.Run(testCase.Name, func(t *testing.T) { + outCh := make(chan *device.FingerprintResponse) + ctx, cancel := context.WithCancel(context.Background()) + go testCase.Device.fingerprint(ctx, outCh) + result := <-outCh + cancel() + require.New(t).Equal(result, testCase.ExpectedWriteToChannel) + }) } } diff --git a/plugins/device/cmd/nvidia/nvml/client.go b/devices/gpu/nvidia/nvml/client.go similarity index 100% rename from plugins/device/cmd/nvidia/nvml/client.go rename to devices/gpu/nvidia/nvml/client.go diff --git a/plugins/device/cmd/nvidia/nvml/client_test.go b/devices/gpu/nvidia/nvml/client_test.go similarity index 100% rename from plugins/device/cmd/nvidia/nvml/client_test.go rename to devices/gpu/nvidia/nvml/client_test.go diff --git a/devices/gpu/nvidia/nvml/driver_default.go b/devices/gpu/nvidia/nvml/driver_default.go new file mode 100644 index 
000000000..e67efa22e --- /dev/null +++ b/devices/gpu/nvidia/nvml/driver_default.go @@ -0,0 +1,33 @@ +// +build !linux + +package nvml + +// Initialize nvml library by locating nvml shared object file and calling ldopen +func (n *nvmlDriver) Initialize() error { + return UnavailableLib +} + +// Shutdown stops any further interaction with nvml +func (n *nvmlDriver) Shutdown() error { + return UnavailableLib +} + +// SystemDriverVersion returns installed driver version +func (n *nvmlDriver) SystemDriverVersion() (string, error) { + return "", UnavailableLib +} + +// DeviceCount reports number of available GPU devices +func (n *nvmlDriver) DeviceCount() (uint, error) { + return 0, UnavailableLib +} + +// DeviceInfoByIndex returns DeviceInfo for index GPU in system device list +func (n *nvmlDriver) DeviceInfoByIndex(index uint) (*DeviceInfo, error) { + return nil, UnavailableLib +} + +// DeviceInfoAndStatusByIndex returns DeviceInfo and DeviceStatus for index GPU in system device list +func (n *nvmlDriver) DeviceInfoAndStatusByIndex(index uint) (*DeviceInfo, *DeviceStatus, error) { + return nil, nil, UnavailableLib +} diff --git a/plugins/device/cmd/nvidia/nvml/driver.go b/devices/gpu/nvidia/nvml/driver_linux.go similarity index 61% rename from plugins/device/cmd/nvidia/nvml/driver.go rename to devices/gpu/nvidia/nvml/driver_linux.go index ef1ba57c4..bdd777561 100644 --- a/plugins/device/cmd/nvidia/nvml/driver.go +++ b/devices/gpu/nvidia/nvml/driver_linux.go @@ -4,59 +4,6 @@ import ( "github.com/NVIDIA/gpu-monitoring-tools/bindings/go/nvml" ) -// DeviceInfo represents nvml device data -// this struct is returned by NvmlDriver DeviceInfoByIndex and -// DeviceInfoAndStatusByIndex methods -type DeviceInfo struct { - // The following fields are guaranteed to be retrieved from nvml - UUID string - PCIBusID string - DisplayState string - PersistenceMode string - - // The following fields can be nil after call to nvml, because nvml was - // not able to retrieve this fields for specific
nvidia card - Name *string - MemoryMiB *uint64 - PowerW *uint - BAR1MiB *uint64 - PCIBandwidthMBPerS *uint - CoresClockMHz *uint - MemoryClockMHz *uint -} - -// DeviceStatus represents nvml device status -// this struct is returned by NvmlDriver DeviceInfoAndStatusByIndex method -type DeviceStatus struct { - // The following fields can be nil after call to nvml, because nvml was - // not able to retrieve this fields for specific nvidia card - PowerUsageW *uint - TemperatureC *uint - GPUUtilization *uint // % - MemoryUtilization *uint // % - EncoderUtilization *uint // % - DecoderUtilization *uint // % - BAR1UsedMiB *uint64 - UsedMemoryMiB *uint64 - ECCErrorsL1Cache *uint64 - ECCErrorsL2Cache *uint64 - ECCErrorsDevice *uint64 -} - -// NvmlDriver represents set of methods to query nvml library -type NvmlDriver interface { - Initialize() error - Shutdown() error - SystemDriverVersion() (string, error) - DeviceCount() (uint, error) - DeviceInfoByIndex(uint) (*DeviceInfo, error) - DeviceInfoAndStatusByIndex(uint) (*DeviceInfo, *DeviceStatus, error) -} - -// nvmlDriver implements NvmlDriver -// Users are required to call Initialize method before using any other methods -type nvmlDriver struct{} - // Initialize nvml library by locating nvml shared object file and calling ldopen func (n *nvmlDriver) Initialize() error { return nvml.Init() diff --git a/devices/gpu/nvidia/nvml/shared.go b/devices/gpu/nvidia/nvml/shared.go new file mode 100644 index 000000000..a0bb04d22 --- /dev/null +++ b/devices/gpu/nvidia/nvml/shared.go @@ -0,0 +1,61 @@ +package nvml + +import "errors" + +var ( + // UnavailableLib is returned when the nvml library could not be loaded. 
+ UnavailableLib = errors.New("could not load NVML library") +) + +// nvmlDriver implements NvmlDriver +// Users are required to call Initialize method before using any other methods +type nvmlDriver struct{} + +// NvmlDriver represents set of methods to query nvml library +type NvmlDriver interface { + Initialize() error + Shutdown() error + SystemDriverVersion() (string, error) + DeviceCount() (uint, error) + DeviceInfoByIndex(uint) (*DeviceInfo, error) + DeviceInfoAndStatusByIndex(uint) (*DeviceInfo, *DeviceStatus, error) +} + +// DeviceInfo represents nvml device data +// this struct is returned by NvmlDriver DeviceInfoByIndex and +// DeviceInfoAndStatusByIndex methods +type DeviceInfo struct { + // The following fields are guaranteed to be retrieved from nvml + UUID string + PCIBusID string + DisplayState string + PersistenceMode string + + // The following fields can be nil after call to nvml, because nvml was + // not able to retrieve these fields for specific nvidia card + Name *string + MemoryMiB *uint64 + PowerW *uint + BAR1MiB *uint64 + PCIBandwidthMBPerS *uint + CoresClockMHz *uint + MemoryClockMHz *uint +} + +// DeviceStatus represents nvml device status +// this struct is returned by NvmlDriver DeviceInfoAndStatusByIndex method +type DeviceStatus struct { + // The following fields can be nil after call to nvml, because nvml was + // not able to retrieve these fields for specific nvidia card + PowerUsageW *uint + TemperatureC *uint + GPUUtilization *uint // % + MemoryUtilization *uint // % + EncoderUtilization *uint // % + DecoderUtilization *uint // % + BAR1UsedMiB *uint64 + UsedMemoryMiB *uint64 + ECCErrorsL1Cache *uint64 + ECCErrorsL2Cache *uint64 + ECCErrorsDevice *uint64 +} diff --git a/plugins/device/cmd/nvidia/stats.go b/devices/gpu/nvidia/stats.go similarity index 97% rename from plugins/device/cmd/nvidia/stats.go rename to devices/gpu/nvidia/stats.go index 022c710fc..f433e7175 100644 --- a/plugins/device/cmd/nvidia/stats.go +++
b/devices/gpu/nvidia/stats.go @@ -4,8 +4,8 @@ import ( "context" "time" + "github.com/hashicorp/nomad/devices/gpu/nvidia/nvml" "github.com/hashicorp/nomad/plugins/device" - "github.com/hashicorp/nomad/plugins/device/cmd/nvidia/nvml" ) const ( @@ -53,8 +53,12 @@ const ( func (d *NvidiaDevice) stats(ctx context.Context, stats chan<- *device.StatsResponse) { defer close(stats) - if d.nvmlClientInitializationError != nil { - d.logger.Error("exiting stats due to problems with NVML loading", "error", d.nvmlClientInitializationError) + if d.initErr != nil { + if d.initErr.Error() != nvml.UnavailableLib.Error() { + d.logger.Error("exiting stats due to problems with NVML loading", "error", d.initErr) + stats <- device.NewStatsError(d.initErr) + } + return } diff --git a/plugins/device/cmd/nvidia/stats_test.go b/devices/gpu/nvidia/stats_test.go similarity index 99% rename from plugins/device/cmd/nvidia/stats_test.go rename to devices/gpu/nvidia/stats_test.go index d60eb88da..c975b4c4c 100644 --- a/plugins/device/cmd/nvidia/stats_test.go +++ b/devices/gpu/nvidia/stats_test.go @@ -7,9 +7,9 @@ import ( "time" hclog "github.com/hashicorp/go-hclog" + "github.com/hashicorp/nomad/devices/gpu/nvidia/nvml" "github.com/hashicorp/nomad/helper" "github.com/hashicorp/nomad/plugins/device" - "github.com/hashicorp/nomad/plugins/device/cmd/nvidia/nvml" "github.com/stretchr/testify/require" ) diff --git a/plugins/device/device.go b/plugins/device/device.go index 53dab5c39..82cdec650 100644 --- a/plugins/device/device.go +++ b/plugins/device/device.go @@ -141,6 +141,13 @@ type StatsResponse struct { Error error } +// NewStatsError takes an error and returns a stats response +func NewStatsError(err error) *StatsResponse { + return &StatsResponse{ + Error: err, + } +} + // DeviceGroupStats contains statistics for each device of a particular // device group, identified by the vendor, type and name of the device. type DeviceGroupStats struct {